From bd0efcaa34b1ddfa85b6e8ef341266c0059c4791 Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Fri, 31 Oct 2025 13:33:34 -0700
Subject: [PATCH 01/20] [lldb-dap] Correctly trigger 'entry' stop reasons.
 (#165901)

Noticed this while looking into test stability that the 'entry' stop
reason is not triggering correctly. This should ensure we correctly
trigger the 'entry' stop reason when launching a process with
`"stopOnEntry": true`. I've also updated the tests to ensure we receive
the 'entry' stop reason to catch this regression.
---
 .../test/tools/lldb-dap/lldbdap_testcase.py   | 10 ++++++
 .../tools/lldb-dap/restart/TestDAP_restart.py | 28 ++---------------
 .../restart/TestDAP_restart_console.py        | 31 ++-----------------
 lldb/tools/lldb-dap/EventHelper.cpp           |  2 +-
 lldb/tools/lldb-dap/JSONUtils.cpp             |  2 +-
 5 files changed, 18 insertions(+), 55 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 29935bb8046ff..c6c4a3e2a4e1e 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -223,6 +223,16 @@ def verify_stop_exception_info(self, expected_description):
                     return True
         return False
 
+    def verify_stop_on_entry(self) -> None:
+        """Waits for the process to be stopped and then verifies at least one
+        thread has the stop reason 'entry'."""
+        self.dap_server.wait_for_stopped()
+        self.assertIn(
+            "entry",
+            (t["reason"] for t in self.dap_server.thread_stop_reasons.values()),
+            "Expected at least one thread to report stop reason 'entry' in {self.dap_server.thread_stop_reasons}",
+        )
+
     def verify_commands(self, flavor: str, output: str, commands: list[str]):
         self.assertTrue(output and len(output) > 0, "expect console output")
         lines = output.splitlines()
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
index 83faf276852f8..e8e07e1e86fc4 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
@@ -51,20 +51,8 @@ def test_stopOnEntry(self):
         self.build_and_launch(program, stopOnEntry=True)
         [bp_main] = self.set_function_breakpoints(["main"])
 
-        self.dap_server.request_configurationDone()
-        self.dap_server.wait_for_stopped()
-        # Once the "configuration done" event is sent, we should get a stopped
-        # event immediately because of stopOnEntry.
-        self.assertTrue(
-            len(self.dap_server.thread_stop_reasons) > 0,
-            "expected stopped event during launch",
-        )
-        for _, body in self.dap_server.thread_stop_reasons.items():
-            if "reason" in body:
-                reason = body["reason"]
-                self.assertNotEqual(
-                    reason, "breakpoint", 'verify stop isn\'t "main" breakpoint'
-                )
+        self.continue_to_next_stop()
+        self.verify_stop_on_entry()
 
         # Then, if we continue, we should hit the breakpoint at main.
         self.continue_to_breakpoints([bp_main])
@@ -73,17 +61,7 @@ def test_stopOnEntry(self):
         # main.
         resp = self.dap_server.request_restart()
         self.assertTrue(resp["success"])
-        stopped_events = self.dap_server.wait_for_stopped()
-        for stopped_event in stopped_events:
-            if "body" in stopped_event:
-                body = stopped_event["body"]
-                if "reason" in body:
-                    reason = body["reason"]
-                    self.assertNotEqual(
-                        reason,
-                        "breakpoint",
-                        'verify stop after restart isn\'t "main" breakpoint',
-                    )
+        self.verify_stop_on_entry()
 
     @skipIfWindows
     def test_arguments(self):
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
index e1ad1425a993d..7d4949907df0d 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_console.py
@@ -11,27 +11,6 @@
 
 @skipIfBuildType(["debug"])
 class TestDAP_restart_console(lldbdap_testcase.DAPTestCaseBase):
-    def verify_stopped_on_entry(self, stopped_events: List[Dict[str, Any]]):
-        seen_stopped_event = 0
-        for stopped_event in stopped_events:
-            body = stopped_event.get("body")
-            if body is None:
-                continue
-
-            reason = body.get("reason")
-            if reason is None:
-                continue
-
-            self.assertNotEqual(
-                reason,
-                "breakpoint",
-                'verify stop after restart isn\'t "main" breakpoint',
-            )
-            if reason == "entry":
-                seen_stopped_event += 1
-
-        self.assertEqual(seen_stopped_event, 1, "expect only one stopped entry event.")
-
     @skipIfAsan
     @skipIfWindows
     @skipIf(oslist=["linux"], archs=["arm$"])  # Always times out on buildbot
@@ -92,11 +71,8 @@ def test_stopOnEntry(self):
         self.build_and_launch(program, console="integratedTerminal", stopOnEntry=True)
         [bp_main] = self.set_function_breakpoints(["main"])
 
-        self.dap_server.request_continue()  # sends configuration done
-        stopped_events = self.dap_server.wait_for_stopped()
-        # We should be stopped at the entry point.
-        self.assertGreaterEqual(len(stopped_events), 0, "expect stopped events")
-        self.verify_stopped_on_entry(stopped_events)
+        self.dap_server.request_configurationDone()
+        self.verify_stop_on_entry()
 
         # Then, if we continue, we should hit the breakpoint at main.
         self.dap_server.request_continue()
@@ -105,8 +81,7 @@ def test_stopOnEntry(self):
         # Restart and check that we still get a stopped event before reaching
         # main.
         self.dap_server.request_restart()
-        stopped_events = self.dap_server.wait_for_stopped()
-        self.verify_stopped_on_entry(stopped_events)
+        self.verify_stop_on_entry()
 
         # continue to main
         self.dap_server.request_continue()
diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp
index c5d5f2bb59b42..12d9e21c52ab3 100644
--- a/lldb/tools/lldb-dap/EventHelper.cpp
+++ b/lldb/tools/lldb-dap/EventHelper.cpp
@@ -176,7 +176,7 @@ llvm::Error SendThreadStoppedEvent(DAP &dap, bool on_entry) {
 
   llvm::DenseSet<lldb::tid_t> old_thread_ids;
   old_thread_ids.swap(dap.thread_ids);
-  uint32_t stop_id = process.GetStopID();
+  uint32_t stop_id = on_entry ? 0 : process.GetStopID();
   const uint32_t num_threads = process.GetNumThreads();
 
   // First make a pass through the threads to see if the focused thread
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 2780a5b7748e8..1a3a6701b194d 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -711,7 +711,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread,
     break;
   }
   if (stop_id == 0)
-    body.try_emplace("reason", "entry");
+    body["reason"] = "entry";
   const lldb::tid_t tid = thread.GetThreadID();
   body.try_emplace("threadId", (int64_t)tid);
   // If no description has been set, then set it to the default thread stopped

From 74ffe1cc16269e558d3877c06f68eb9779eaaf3e Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Fri, 31 Oct 2025 13:38:18 -0700
Subject: [PATCH 02/20] [DirectX] Annotate interfaces for DLL export (#165914)

This is largely based off of #143615, but for the DirectX target which
is still in experimental.
---
 llvm/include/llvm/InitializePasses.h                         | 1 -
 llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp                | 3 ++-
 llvm/lib/Target/DirectX/DirectXTargetMachine.cpp             | 3 ++-
 llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp | 3 ++-
 llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp     | 3 ++-
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 581b4ad161daa..c8196d8a7ef48 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -90,7 +90,6 @@ LLVM_ABI void initializeDSELegacyPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
-LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &);
 LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &);
diff --git a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
index 15def3637c5a7..b6bbb201f5c5d 100644
--- a/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DirectXAsmPrinter.cpp
@@ -52,6 +52,7 @@ void DXILAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   emitGlobalConstant(GV->getDataLayout(), GV->getInitializer());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXAsmPrinter() {
   RegisterAsmPrinter<DXILAsmPrinter> X(getTheDirectXTarget());
 }
diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
index bcf84403b2c0d..84b1a313df2ea 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp
@@ -53,7 +53,8 @@
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTarget() {
   RegisterTargetMachine<DirectXTargetMachine> X(getTheDirectXTarget());
   auto *PR = PassRegistry::getPassRegistry();
   initializeDXILIntrinsicExpansionLegacyPass(*PR);
diff --git a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
index 9a14c01f62ae7..62ad014f3739f 100644
--- a/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
+++ b/llvm/lib/Target/DirectX/MCTargetDesc/DirectXMCTargetDesc.cpp
@@ -132,7 +132,8 @@ static MCRegisterInfo *createDirectXMCRegisterInfo(const Triple &Triple) {
 
 static MCInstrInfo *createDirectXMCInstrInfo() { return new MCInstrInfo(); }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetMC() {
   Target &T = getTheDirectXTarget();
   RegisterMCAsmInfo<DirectXMCAsmInfo> X(T);
   TargetRegistry::RegisterMCInstrInfo(T, createDirectXMCInstrInfo);
diff --git a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
index ae01626e5229d..934bd1b0e8adb 100644
--- a/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
+++ b/llvm/lib/Target/DirectX/TargetInfo/DirectXTargetInfo.cpp
@@ -24,7 +24,8 @@ Target &getTheDirectXTarget() {
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeDirectXTargetInfo() {
   RegisterTarget<Triple::dxil, /*HasJIT=*/false> X(
       getTheDirectXTarget(), "dxil", "DirectX Intermediate Language", "DXIL");
 }

From 1e2ed8a5861a304d42214fc48c1fa1b9211b3540 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 31 Oct 2025 20:36:48 +0000
Subject: [PATCH 03/20] [MLGO] Update MLRegAlloc Test

This was broken by 5322fb6268208a8fc031fb13573dac9729d05db6. Update the
test to be a little more resilient to flaky failures and to pass after
those changes. We should probably delete this now that we have MIR2Vec,
but punting that for now.
---
 llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
index bd8d882cda39b..9dd402d13b8e0 100644
--- a/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
+++ b/llvm/test/CodeGen/MLRegAlloc/dev-mode-extra-features-logging.ll
@@ -26,7 +26,7 @@
 ; Also, the first eviction problem is significantly less than 300 instructions. Check
 ; that there is a zero value.
 ; Note: we're regex-ing some of the opcodes to avoid test flakyness.
-; CHECK: instructions: 20,{{([0-9]{4})}},1{{([0-9]{3})}},2{{([0-9]{3})}},{{.*}},0,
+; CHECK: instructions: 20,{{([0-9]{4})}},{{([0-9]{4})}},{{([0-9]{4})}},{{.*}},0,
 ; Only the candidate virtreg and the 10th LR are included in this problem. Make
 ; sure the other LRs have values of zero. There are 2700 0s followed by some 1s.
 ; There's a limit to how many repetitions can be matched.

From c528f60573875652d75a394360fe64eee813919b Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 31 Oct 2025 21:02:55 +0000
Subject: [PATCH 04/20] Revert "[SLP][NFC]Add a test with the incorrect
 minbitwidth in alternate nodes, NFC"

This reverts commit 0dca7ee4480f11cd0230d316ccc5d2c7234a4b31.

This broke check-llvm, including on premerge.

https://lab.llvm.org/buildbot/#/builders/137/builds/28194
https://lab.llvm.org/staging/#/builders/21/builds/7649
---
 ...ernate-opcode-strict-bitwidth-than-main.ll | 36 -------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
deleted file mode 100644
index 959b2350d9d78..0000000000000
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-define float @test(i8 %0) {
-; CHECK-LABEL: define float @test(
-; CHECK-SAME: i8 [[TMP0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27>
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    switch i32 [[TMP8]], label %[[EXIT:.*]] [
-; CHECK-NEXT:      i32 0, label %[[EXIT]]
-; CHECK-NEXT:      i32 1, label %[[EXIT]]
-; CHECK-NEXT:    ]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret float 0.000000e+00
-;
-entry:
-  %1 = sext i8 0 to i32
-  %2 = lshr i32 %1, 27
-  %3 = sext i8 %0 to i32
-  %reass.add.epil = mul i32 %3, 2
-  %4 = or i32 %reass.add.epil, %2
-  switch i32 %4, label %exit [
-  i32 0, label %exit
-  i32 1, label %exit
-  ]
-
-exit:
-  ret float 0.000000e+00
-}

From cf829cc11c0db7500bb4222c5c5c22b276a314d9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 31 Oct 2025 14:50:17 -0700
Subject: [PATCH 05/20] AMDGPU: Add baseline test for #161651 (#165921)

---
 .../umin-sub-to-usubo-select-combine.ll       | 236 ++++++++++++++++++
 1 file changed, 236 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll

diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..22e4a24435f12
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+
+define i16 @v_underflow_compare_fold_i16(i16 %a, i16 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u16_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u16_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u16 v0.h, v0.l, v1.l
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u16 v0.l, v0.h, v0.l
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i16 %a, %b
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+  ret i32 %cond
+}
+
+define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
+; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v[2:3], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i32 %a, %b
+  store i32 %sub, ptr addrspace(1) %ptr
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_commute(i64 %a, i64 %b) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_commute:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+define i64 @v_underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) #0 {
+; GFX9-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_underflow_compare_fold_i64_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
+; GFX11-NEXT:    global_store_b64 v[4:5], v[2:3], off
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sub = sub i64 %a, %b
+  store i64 %sub, ptr addrspace(1) %ptr
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s1, s0, s1
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX9-NEXT:    s_min_u32 s0, s1, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_i32 s1, s0, s1
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_min_u32 s0, s1, s0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i16 %a, %b
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_i32 s1, s0, s1
+; GFX9-NEXT:    s_min_u32 s0, s1, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_i32 s1, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_min_u32 s0, s1, s0
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i32 %a, %b
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+define amdgpu_ps i64 @s_underflow_compare_fold_i64(i64 inreg %a, i64 inreg %b) #0 {
+; GFX9-LABEL: s_underflow_compare_fold_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sub_u32 s2, s0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_subb_u32 s3, s1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
+; GFX9-NEXT:    s_and_b64 s[4:5], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_underflow_compare_fold_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_sub_u32 s2, s0, s2
+; GFX11-NEXT:    s_subb_u32 s3, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[2:3], s[0:1]
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX11-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX11-NEXT:    ; return to shader part epilog
+  %sub = sub i64 %a, %b
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+attributes #0 = { nounwind }

From 4ac74fc6143b787e9e9ccd088b27ab6fe384b77c Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Fri, 31 Oct 2025 13:13:58 -0700
Subject: [PATCH 06/20] [SLP][NFC]Add a test with the incorrect minbitwidth in
 alternate nodes, NFC

---
 ...ernate-opcode-strict-bitwidth-than-main.ll | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
new file mode 100644
index 0000000000000..cc2e16e2b099b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define float @test(i8 %0) {
+; CHECK-LABEL: define float @test(
+; CHECK-SAME: i8 [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i16> [[TMP2]], <i16 2, i16 27>
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i16> [[TMP2]], <i16 2, i16 27>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    switch i32 [[TMP8]], label %[[EXIT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[EXIT]]
+; CHECK-NEXT:      i32 1, label %[[EXIT]]
+; CHECK-NEXT:    ]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+entry:
+  %1 = sext i8 0 to i32
+  %2 = lshr i32 %1, 27
+  %3 = sext i8 %0 to i32
+  %reass.add.epil = mul i32 %3, 2
+  %4 = or i32 %reass.add.epil, %2
+  switch i32 %4, label %exit [
+  i32 0, label %exit
+  i32 1, label %exit
+  ]
+
+exit:
+  ret float 0.000000e+00
+}

From 74d4870aa503e90a614e4bc725d670abc5f90218 Mon Sep 17 00:00:00 2001
From: Tom Yang <zhenyutyang@gmail.com>
Date: Fri, 31 Oct 2025 15:08:39 -0700
Subject: [PATCH 07/20] update ManualDWARFIndex::Index to use std::once
 (#165896)

Small change to use (what I think is) a better practice -- we were using
the `m_indexed` bool member to make sure we called `Index()` once, but
we should just use `std::once`! This change shouldn't affect
functionality.

This change may also make concurrent access to `Index()` thread-safe,
though the ManualDWARFIndex API isn't completely thread-safe due to
`Decode()`. I'm not sure if ManualDWARFIndex was ever intended to be
thread-safe.

Test Plan:

`ninja check-lldb`

Tested basic debugging workflow of a couple of large projects I had
built. Basically:
```
(lldb) target create <project>
(lldb) b main
(lldb) r
(lldb) step
...
```

I A/B tested the performance of launching several modules with parallel
module loading and didn't observe any performance regressions.

---------

Co-authored-by: Tom Yang <toyang@fb.com>
---
 lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp | 7 +++----
 lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h   | 8 +++++++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
index d90108f687f84..36dee1470e0a2 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp
@@ -22,7 +22,6 @@
 #include "lldb/Utility/Stream.h"
 #include "lldb/Utility/Timer.h"
 #include "lldb/lldb-private-enumerations.h"
-#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ThreadPool.h"
 #include <atomic>
 #include <optional>
@@ -33,10 +32,10 @@ using namespace lldb_private::plugin::dwarf;
 using namespace llvm::dwarf;
 
 void ManualDWARFIndex::Index() {
-  if (m_indexed)
-    return;
-  m_indexed = true;
+  std::call_once(m_indexed_flag, [this]() { IndexImpl(); });
+}
 
+void ManualDWARFIndex::IndexImpl() {
   ElapsedTime elapsed(m_index_time);
   LLDB_SCOPED_TIMERF("%p", static_cast<void *>(m_dwarf));
   if (LoadFromCache()) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
index 0b5b2f3e84309..41e0e620a4896 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.h
@@ -66,8 +66,14 @@ class ManualDWARFIndex : public DWARFIndex {
   void Dump(Stream &s) override;
 
 private:
+  /// Reads the DWARF debug info to build the index once.
+  ///
+  /// Should be called before attempting to retrieve symbols.
   void Index();
 
+  /// Call `ManualDWARFIndex::Index()` instead.
+  void IndexImpl();
+
   /// Decode a serialized version of this object from data.
   ///
   /// \param data
@@ -170,7 +176,7 @@ class ManualDWARFIndex : public DWARFIndex {
   llvm::DenseSet<uint64_t> m_type_sigs_to_avoid;
 
   IndexSet<NameToDIE> m_set;
-  bool m_indexed = false;
+  std::once_flag m_indexed_flag;
 };
 } // namespace dwarf
 } // namespace lldb_private::plugin

From d310693bde68b49cdb5c5877aadffb41d35c45fb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 31 Oct 2025 15:11:13 -0700
Subject: [PATCH 08/20] [SelectionDAG] Use GetPromotedInteger when promoting
 integer operands of PATCHPOINT/STACKMAP. (#165926)

This is consistent with other promotion, but causes negative constants
to be sign extended instead of zero extended in some cases.

I guess getNode and type legalizer are inconsistent about what
ANY_EXTEND of a constant does.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 8 ++------
 llvm/test/CodeGen/AArch64/stackmap.ll                  | 4 ++--
 llvm/test/CodeGen/SystemZ/stackmap.ll                  | 4 ++--
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b1776eaae6e86..44e5a187c4281 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2871,18 +2871,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
   assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
   SmallVector<SDValue> NewOps(N->ops());
-  SDValue Operand = N->getOperand(OpNo);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
-  NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+  NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
   assert(OpNo >= 7);
   SmallVector<SDValue> NewOps(N->ops());
-  SDValue Operand = N->getOperand(OpNo);
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
-  NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+  NewOps[OpNo] = GetPromotedInteger(NewOps[OpNo]);
   return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/stackmap.ll b/llvm/test/CodeGen/AArch64/stackmap.ll
index 995d2545c6359..26221d0c26eb2 100644
--- a/llvm/test/CodeGen/AArch64/stackmap.ll
+++ b/llvm/test/CodeGen/AArch64/stackmap.ll
@@ -81,14 +81,14 @@
 ; CHECK-NEXT:   .hword  8
 ; CHECK-NEXT:   .hword  0
 ; CHECK-NEXT:   .hword  0
-; CHECK-NEXT:   .word   65535
+; CHECK-NEXT:   .word   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
 ; CHECK-NEXT:   .hword  8
 ; CHECK-NEXT:   .hword  0
 ; CHECK-NEXT:   .hword  0
-; CHECK-NEXT:   .word   65535
+; CHECK-NEXT:   .word   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
diff --git a/llvm/test/CodeGen/SystemZ/stackmap.ll b/llvm/test/CodeGen/SystemZ/stackmap.ll
index 05b8de756c032..f414ea33a6e80 100644
--- a/llvm/test/CodeGen/SystemZ/stackmap.ll
+++ b/llvm/test/CodeGen/SystemZ/stackmap.ll
@@ -84,14 +84,14 @@
 ; CHECK-NEXT:   .short  8
 ; CHECK-NEXT:   .short  0
 ; CHECK-NEXT:   .short  0
-; CHECK-NEXT:   .long   65535
+; CHECK-NEXT:   .long   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0
 ; CHECK-NEXT:   .short  8
 ; CHECK-NEXT:   .short  0
 ; CHECK-NEXT:   .short  0
-; CHECK-NEXT:   .long   65535
+; CHECK-NEXT:   .long   -1
 ; SmallConstant
 ; CHECK-NEXT:   .byte   4
 ; CHECK-NEXT:   .byte   0

From 964c7711f4384b08f051d17da888d35d03a3024a Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Fri, 31 Oct 2025 15:07:26 -0700
Subject: [PATCH 09/20] [SLP]Fix the minbitwidth analysis for slternate opcodes

If the laternate operation is more stricter than the main operation, we
cannot rely on the analysis of the main operation. In such case, better
to avoid doing the analysis at all, since it may affect the overall
result and lead to incorrect optimization

Fixes #165878
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 21 +++++++++++++++++++
 ...ernate-opcode-strict-bitwidth-than-main.ll | 14 ++++++-------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b55a3b235228..34b405ced8c0a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22134,6 +22134,27 @@ bool BoUpSLP::collectValuesToDemote(
         {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
          VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
 
+  if (E.isAltShuffle()) {
+    // Combining these opcodes may lead to incorrect analysis, skip for now.
+    auto IsDangerousOpcode = [](unsigned Opcode) {
+      switch (Opcode) {
+      case Instruction::Shl:
+      case Instruction::AShr:
+      case Instruction::LShr:
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        return true;
+      default:
+        break;
+      }
+      return false;
+    };
+    if (IsDangerousOpcode(E.getAltOpcode()))
+      return FinalAnalysis();
+  }
+
   switch (E.getOpcode()) {
 
   // We can always demote truncations and extensions. Since truncations can
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
index cc2e16e2b099b..959b2350d9d78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
@@ -6,14 +6,12 @@ define float @test(i8 %0) {
 ; CHECK-SAME: i8 [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i16> [[TMP2]], <i16 2, i16 27>
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i16> [[TMP2]], <i16 2, i16 27>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    switch i32 [[TMP8]], label %[[EXIT:.*]] [
 ; CHECK-NEXT:      i32 0, label %[[EXIT]]

From c87e3c922d956c0d4a6d9910094a2e060f018827 Mon Sep 17 00:00:00 2001
From: Yifei Xu <yifei.xu@utexas.edu>
Date: Fri, 31 Oct 2025 22:29:36 +0000
Subject: [PATCH 10/20] [MLIR][XeGPU] Remove an unused include and break
 circular dependency in bazel build (#165930)

It will otherwise introduce a circular dependency XeGPUDialect ->
XeGPUUtils -> XeGPUDialect.
---
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp        | 1 -
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 -
 2 files changed, 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 6b4c185d7d897..83406c8c75dcf 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -11,7 +11,6 @@
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
-#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 1385e1a802d5b..83414ceed5ca5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3729,7 +3729,6 @@ cc_library(
         ":XeGPUAttrInterfaceIncGen",
         ":XeGPUEnumsIncGen",
         ":XeGPUIncGen",
-        ":XeGPUUtils",
         ":XeGPUuArch",
         ":XeVMDialect",
         "//llvm:Support",

From 045f3ceb84d982b5ccc39144f26d41674f4059fc Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Fri, 31 Oct 2025 15:45:36 -0700
Subject: [PATCH 11/20] Add test cases to profcheck-xfail.txt in unfixed (yet)
 areas (#165933)

A remaining failing one, under SimplifyCFG (which is pass that we did fix) is covered in #165931
---
 llvm/utils/profcheck-xfail.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 83bffc70574a8..380b162d8c58c 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -517,8 +517,11 @@ Instrumentation/TypeSanitizer/alloca-only.ll
 Instrumentation/TypeSanitizer/anon.ll
 Instrumentation/TypeSanitizer/basic.ll
 Instrumentation/TypeSanitizer/basic-nosan.ll
+Instrumentation/TypeSanitizer/basic_outlined.ll
+Instrumentation/TypeSanitizer/basic_verify_outlined.ll
 Instrumentation/TypeSanitizer/byval.ll
 Instrumentation/TypeSanitizer/globals.ll
+Instrumentation/TypeSanitizer/globals_outlined.ll
 Instrumentation/TypeSanitizer/invalid-metadata.ll
 Instrumentation/TypeSanitizer/memintrinsics.ll
 Instrumentation/TypeSanitizer/nosanitize.ll
@@ -729,6 +732,7 @@ Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
 Transforms/ExpandVariadics/intrinsics.ll
 Transforms/FixIrreducible/basic.ll
 Transforms/FixIrreducible/bug45623.ll
+Transforms/FixIrreducible/callbr.ll
 Transforms/FixIrreducible/nested.ll
 Transforms/FixIrreducible/switch.ll
 Transforms/GCOVProfiling/atomic-counter.ll
@@ -1106,6 +1110,7 @@ Transforms/LoopSimplifyCFG/update_parents.ll
 Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll
 Transforms/LoopUnroll/peel-last-iteration-with-guards.ll
 Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
+Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
 Transforms/LoopVersioning/add-phi-update-users.ll
 Transforms/LoopVersioning/basic.ll
 Transforms/LoopVersioning/bound-check-partially-known.ll

From 6adef40e756eb427508ae245bfa0fd846573782e Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Fri, 31 Oct 2025 16:10:59 -0700
Subject: [PATCH 12/20] [SimplifyCFG] Don't propagate weights to unconditional
 branches in `turnSwitchRangeIntoICmp` (#165931)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #161000 introduced a bug whereby the IR would become invalid by having an unconditional branch have `!prof`​attached to it. This only became evident in PR #165744, because the IR of `test/Transforms/SimplifyCFG/pr165301.ll`​was simple enough to both (1) introduce the unconditional branch, and (2) survive in that fashion until the end of the pass (simplifycfg) and thus trip the verifier.
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp    |  2 +-
 llvm/test/Transforms/SimplifyCFG/pr165301.ll | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 6addcfab15125..cbc604e87cf1a 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5956,7 +5956,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
   }
 
   // Update weight for the newly-created conditional branch.
-  if (hasBranchWeightMD(*SI)) {
+  if (hasBranchWeightMD(*SI) && NewBI->isConditional()) {
     SmallVector<uint64_t, 8> Weights;
     getBranchWeights(SI, Weights);
     if (Weights.size() == 1 + SI->getNumCases()) {
diff --git a/llvm/test/Transforms/SimplifyCFG/pr165301.ll b/llvm/test/Transforms/SimplifyCFG/pr165301.ll
index 4a539d77af3cb..1df655250f57e 100644
--- a/llvm/test/Transforms/SimplifyCFG/pr165301.ll
+++ b/llvm/test/Transforms/SimplifyCFG/pr165301.ll
@@ -1,11 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6
 ; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s
 
 ; Make sure there's no use after free when removing incoming values from PHI nodes
 
-define i32 @pr165301(i1 %cond) {
+define i32 @pr165301(i1 %cond) !prof !0 {
 ; CHECK-LABEL: define i32 @pr165301(
-; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-SAME: i1 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[SWITCHBB:.*]]
 ; CHECK:       [[SWITCHBB]]:
@@ -18,9 +18,14 @@ switchbb:
   switch i1 %cond, label %default [
   i1 false, label %switchbb
   i1 true, label %switchbb
-  ]
+  ], !prof !1
 
 default:
   %phi.lcssa = phi i32 [ 0, %switchbb ]
   ret i32 %phi.lcssa
 }
+!0 = !{!"function_entry_count", i32 10}
+!1 = !{!"branch_weights", i32 2, i32 3, i32 5}
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10}
+;.

From 3d1aece3d166a77788af19c845cd39ae668aaa7c Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara@apple.com>
Date: Fri, 31 Oct 2025 16:16:43 -0700
Subject: [PATCH 13/20] [docs] Fix GlobalISel sync up gcal link to point to the
 new one.

---
 llvm/docs/GettingInvolved.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 039d61624093d..0dba9412564d4 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -225,7 +225,7 @@ what to add to your calendar invite.
      -
    * - GlobalISel
      - Every 2nd Tuesday of the month
-     - `gcal <https://calendar.google.com/calendar/u/0?cid=ZDcyMjc0ZjZiZjNhMzFlYmE3NTNkMWM2MGM2NjM5ZWU3ZDE2MjM4MGFlZDc2ZjViY2UyYzMwNzVhZjk4MzQ4ZEBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
+     - `gcal <https://calendar.google.com/calendar/u/0?cid=YWZjNzhmMzE4MDNlNTAyNGY1NmE1MDIyODY0YTYwZmJmYzRjYTEwNTE1NmUxODA2NzBkYTliY2ZhYTVkNjk0NUBncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`__
      - `Meeting details/agenda <https://docs.google.com/document/d/1Ry8O4-Tm5BFj9AMjr8qTQFU80z-ptiNQ62687NaIvLs/edit?usp=sharing>`__
 
 

From be2081d9457ed095c4a6ebe2a920f0f7b76369c6 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang@amd.com>
Date: Fri, 31 Oct 2025 16:48:20 -0700
Subject: [PATCH 14/20] [AMDGPU] Set VADDR4 field to NULL for tensor ops for
 gfx1250 (#165917)

This is based on the latest spec.
---
 llvm/lib/Target/AMDGPU/MIMGInstructions.td    |  4 ++-
 llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s      | 16 +++++-----
 .../AMDGPU/gfx1250_dasm_vimage.txt            | 32 +++++++++----------
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d95013123aced..65dce74a1e894 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2116,8 +2116,10 @@ class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = p
   let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
   let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
 
+  // Set VADDR4 to NULL
+  let vaddr4 = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+
   // set to 0 based on SPG.
-  let vaddr4 = 0;
   let rsrc = 0;
   let vdata = 0;
   let d16 = 0;
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
index fec8ba19f93fe..0a480a73cde5b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s
@@ -2,33 +2,33 @@
 ; RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
 
 tensor_load_to_lds s[0:3], s[4:11]
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19]
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV
-// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11]
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19]
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV
-// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
+// GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
 // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
index 9afaa075ea838..800579391d8eb 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt
@@ -1,25 +1,25 @@
 # RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x00,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x00,0x00,0x04,0x7c,0x7c
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11] th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x3c,0x7c,0x00,0x04,0x7c,0x7c
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x00,0x7c,0x00,0x04,0x0c,0x10
 
-# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10]
-0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x00,0x00,0x04,0x0c,0x10
+# GFX1250: tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10]
+0x01,0x40,0x71,0xd0,0x00,0x00,0x68,0x7c,0x00,0x04,0x0c,0x10

From 128af4503aa3c5dac50ccbe50dec4ecc56d2cfb2 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 31 Oct 2025 18:13:35 -0700
Subject: [PATCH 15/20] [lldb-dap] Move Options.td into tool subdirectory (NFC)
 (#165925)

---
 lldb/tools/lldb-dap/CMakeLists.txt        | 3 ---
 lldb/tools/lldb-dap/tool/CMakeLists.txt   | 4 ++++
 lldb/tools/lldb-dap/{ => tool}/Options.td | 0
 3 files changed, 4 insertions(+), 3 deletions(-)
 rename lldb/tools/lldb-dap/{ => tool}/Options.td (100%)

diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index 7db334ca56bcf..dd1bbbdddfc59 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -1,9 +1,6 @@
 # We need to include the llvm components we depend on manually, as liblldb does
 # not re-export those.
 set(LLVM_LINK_COMPONENTS Support)
-set(LLVM_TARGET_DEFINITIONS Options.td)
-tablegen(LLVM Options.inc -gen-opt-parser-defs)
-add_public_tablegen_target(LLDBDAPOptionsTableGen)
 
 add_lldb_library(lldbDAP
   Breakpoint.cpp
diff --git a/lldb/tools/lldb-dap/tool/CMakeLists.txt b/lldb/tools/lldb-dap/tool/CMakeLists.txt
index b39a4ed9c40e7..5335d25c5d450 100644
--- a/lldb/tools/lldb-dap/tool/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/tool/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS Options.td)
+tablegen(LLVM Options.inc -gen-opt-parser-defs)
+add_public_tablegen_target(LLDBDAPOptionsTableGen)
+
 add_lldb_tool(lldb-dap
   lldb-dap.cpp
 
diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/tool/Options.td
similarity index 100%
rename from lldb/tools/lldb-dap/Options.td
rename to lldb/tools/lldb-dap/tool/Options.td

From 34f1dbfa3814b2daed9c4599c5120cb56f2c3ec0 Mon Sep 17 00:00:00 2001
From: Mike <FruitClover@gmail.com>
Date: Sat, 1 Nov 2025 04:25:18 +0300
Subject: [PATCH 16/20] [mlir][memref] Refine doc examples for operations
 (#165889)

Some of the examples contain typos; some of them use outdated assembly format,
and some annotations are missing. This is the best effort to keep them
"parsable" (assuming that most of the types are already defined).
---
 .../mlir/Dialect/MemRef/IR/MemRefOps.td       | 72 ++++++++++---------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index b39207fc30dd7..e00f3c1526005 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -323,8 +323,8 @@ def MemRef_ReallocOp : MemRef_Op<"realloc"> {
 
     ```mlir
     %new = memref.realloc %old : memref<64xf32> to memref<124xf32>
-    %4 = memref.load %new[%index]   // ok
-    %5 = memref.load %old[%index]   // undefined behavior
+    %4 = memref.load %new[%index] : memref<124xf32> // ok
+    %5 = memref.load %old[%index] : memref<64xf32>  // undefined behavior
     ```
   }];
 
@@ -445,9 +445,10 @@ def MemRef_AllocaScopeOp : MemRef_Op<"alloca_scope",
     operation:
 
     ```mlir
-    %result = memref.alloca_scope {
+    %result = memref.alloca_scope -> f32 {
+      %value = arith.constant 1.0 : f32
       ...
-      memref.alloca_scope.return %value
+      memref.alloca_scope.return %value : f32
     }
     ```
 
@@ -478,7 +479,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return",
     to indicate which values are going to be returned. For example:
 
     ```mlir
-    memref.alloca_scope.return %value
+    memref.alloca_scope.return %value : f32
     ```
   }];
 
@@ -543,11 +544,11 @@ def MemRef_CastOp : MemRef_Op<"cast", [
     Example:
 
     ```mlir
-    Cast to concrete shape.
-        %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
+    // Cast to concrete shape.
+    %4 = memref.cast %1 : memref<*xf32> to memref<4x?xf32>
 
-    Erase rank information.
-        %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
+    // Erase rank information.
+    %5 = memref.cast %1 : memref<4x?xf32> to memref<*xf32>
     ```
   }];
 
@@ -613,8 +614,8 @@ def MemRef_DeallocOp : MemRef_Op<"dealloc", [MemRefsNormalizable]> {
     Example:
 
     ```mlir
-    %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1), 1>>
-    memref.dealloc %0 : memref<8x64xf32,  affine_map<(d0, d1) -> (d0, d1), 1>>
+    %0 = memref.alloc() : memref<8x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1>
+    memref.dealloc %0 : memref<8x64xf32,  affine_map<(d0, d1) -> (d0, d1)>, 1>
     ```
   }];
 
@@ -728,13 +729,13 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
     space 1 at indices [%k, %l], would be specified as follows:
 
     ```mlir
-    %num_elements = arith.constant 256
+    %num_elements = arith.constant 256 : index
     %idx = arith.constant 0 : index
-    %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 4>
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
-      memref<40 x 128 x f32>, affine_map<(d0) -> (d0)>, 0>,
-      memref<2 x 1024 x f32>, affine_map<(d0) -> (d0)>, 1>,
-      memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+    %tag = memref.alloc() : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx] :
+      memref<40 x 128 x f32, affine_map<(d0, d1) -> (d0, d1)>, 0>,
+      memref<2 x 1024 x f32, affine_map<(d0, d1) -> (d0, d1)>, 1>,
+      memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
     ```
 
     If %stride and %num_elt_per_stride are specified, the DMA is expected to
@@ -742,8 +743,8 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
     memory space 0 until %num_elements are transferred.
 
     ```mlir
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
-              %num_elt_per_stride :
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%idx], %stride,
+                     %num_elt_per_stride :
     ```
 
     * TODO: add additional operands to allow source and destination striding, and
@@ -891,10 +892,10 @@ def MemRef_DmaWaitOp : MemRef_Op<"dma_wait"> {
    Example:
 
    ```mlir
-    dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
-      memref<2048 x f32>, affine_map<(d0) -> (d0)>, 0>,
-      memref<256 x f32>, affine_map<(d0) -> (d0)>, 1>
-      memref<1 x i32>, affine_map<(d0) -> (d0)>, 2>
+    memref.dma_start %src[%i, %j], %dst[%k, %l], %num_elements, %tag[%index] :
+      memref<2048 x f32, affine_map<(d0) -> (d0)>, 0>,
+      memref<256 x f32, affine_map<(d0) -> (d0)>, 1>,
+      memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
     ...
     ...
     dma_wait %tag[%index], %num_elements : memref<1 x i32, affine_map<(d0) -> (d0)>, 2>
@@ -1004,8 +1005,8 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
 
     ```mlir
       %base, %offset, %sizes:2, %strides:2 =
-        memref.extract_strided_metadata %memref :
-          memref<10x?xf32>, index, index, index, index, index
+        memref.extract_strided_metadata %memref : memref<10x?xf32>
+          -> memref<f32>, index, index, index, index, index
 
       // After folding, the type of %m2 can be memref<10x?xf32> and further
       // folded to %memref.
@@ -1013,7 +1014,7 @@ def MemRef_ExtractStridedMetadataOp : MemRef_Op<"extract_strided_metadata", [
           offset: [%offset],
           sizes: [%sizes#0, %sizes#1],
           strides: [%strides#0, %strides#1]
-        : memref<f32> to memref<?x?xf32, offset: ?, strides: [?, ?]>
+        : memref<f32> to memref<?x?xf32, strided<[?, ?], offset:?>>
     ```
   }];
 
@@ -1182,10 +1183,10 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
 
     ```mlir
     // Private variable with an initial value.
-    memref.global "private" @x : memref<2xf32> = dense<0.0,2.0>
+    memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]>
 
     // Private variable with an initial value and an alignment (power of 2).
-    memref.global "private" @x : memref<2xf32> = dense<0.0,2.0> {alignment = 64}
+    memref.global "private" @x : memref<2xf32> = dense<[0.0, 2.0]> {alignment = 64}
 
     // Declaration of an external variable.
     memref.global "private" @y : memref<4xi32>
@@ -1194,7 +1195,7 @@ def MemRef_GlobalOp : MemRef_Op<"global", [Symbol]> {
     memref.global @z : memref<3xf16> = uninitialized
 
     // Externally visible constant variable.
-    memref.global constant @c : memref<2xi32> = dense<1, 4>
+    memref.global constant @c : memref<2xi32> = dense<[1, 4]>
     ```
   }];
 
@@ -1555,7 +1556,8 @@ def MemRef_ReinterpretCastOp
     %dst = memref.reinterpret_cast %src to
       offset: [%offset],
       sizes: [%sizes],
-      strides: [%strides]
+      strides: [%strides] :
+      memref<*xf32> to memref<?x?xf32, strided<[?, ?], offset: ?>>
     ```
     means that `%dst`'s descriptor will be:
     ```mlir
@@ -1695,12 +1697,12 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
     ```mlir
     // Reshape statically-shaped memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<4x1xf32>, memref<1xi32>) to memref<4xf32>
+             : (memref<4x1xf32>, memref<1xi32>) -> memref<4xf32>
     %dst0 = memref.reshape %src(%shape0)
-             : (memref<4x1xf32>, memref<2xi32>) to memref<2x2xf32>
+             : (memref<4x1xf32>, memref<2xi32>) -> memref<2x2xf32>
     // Flatten unranked memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<*xf32>, memref<1xi32>) to memref<?xf32>
+             : (memref<*xf32>, memref<1xi32>) -> memref<?xf32>
     ```
 
     b. Source type is ranked or unranked. Shape argument has dynamic size.
@@ -1709,10 +1711,10 @@ def MemRef_ReshapeOp: MemRef_Op<"reshape", [
     ```mlir
     // Reshape dynamically-shaped 1D memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<?xf32>, memref<?xi32>) to memref<*xf32>
+             : (memref<?xf32>, memref<?xi32>) -> memref<*xf32>
     // Reshape unranked memref.
     %dst = memref.reshape %src(%shape)
-             : (memref<*xf32>, memref<?xi32>) to memref<*xf32>
+             : (memref<*xf32>, memref<?xi32>) -> memref<*xf32>
     ```
   }];
 

From 23ef59a1d35080b26f379823ead29eb1b706ff7a Mon Sep 17 00:00:00 2001
From: Mike <FruitClover@gmail.com>
Date: Sat, 1 Nov 2025 04:25:44 +0300
Subject: [PATCH 17/20] [mlir] Fix mlir-runner memref-reshape test with
 unranked inputs (#165902)

Were using ranked before.
---
 mlir/test/mlir-runner/memref-reshape.mlir | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/test/mlir-runner/memref-reshape.mlir b/mlir/test/mlir-runner/memref-reshape.mlir
index 8c17f1fd02358..b264e0285953f 100644
--- a/mlir/test/mlir-runner/memref-reshape.mlir
+++ b/mlir/test/mlir-runner/memref-reshape.mlir
@@ -65,8 +65,8 @@ func.func @reshape_ranked_memref_to_ranked(%input : memref<2x3xf32>,
 func.func @reshape_unranked_memref_to_ranked(%input : memref<2x3xf32>,
                                         %shape : memref<2xindex>) {
   %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
-  %output = memref.reshape %input(%shape)
-                : (memref<2x3xf32>, memref<2xindex>) -> memref<?x?xf32>
+  %output = memref.reshape %unranked_input(%shape)
+                : (memref<*xf32>, memref<2xindex>) -> memref<?x?xf32>
 
   %unranked_output = memref.cast %output : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%unranked_output) : (memref<*xf32>) -> ()
@@ -95,8 +95,8 @@ func.func @reshape_unranked_memref_to_unranked(%input : memref<2x3xf32>,
                                           %shape : memref<2xindex>) {
   %unranked_input = memref.cast %input : memref<2x3xf32> to memref<*xf32>
   %dyn_size_shape = memref.cast %shape : memref<2xindex> to memref<?xindex>
-  %output = memref.reshape %input(%dyn_size_shape)
-                : (memref<2x3xf32>, memref<?xindex>) -> memref<*xf32>
+  %output = memref.reshape %unranked_input(%dyn_size_shape)
+                : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
 
   call @printMemrefF32(%output) : (memref<*xf32>) -> ()
   // CHECK: rank = 2 offset = 0 sizes = [3, 2] strides = [2, 1] data =

From cc271437553452ede002d871d32abc02084341a8 Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang <yangzhaoxin@loongson.cn>
Date: Sat, 1 Nov 2025 09:29:00 +0800
Subject: [PATCH 18/20] [LoongArch] Make ceil,floor,trunc,roundeven legal for
 lsx/lasx (#165217)

---
 .../LoongArch/LoongArchISelLowering.cpp       |   8 +
 .../LoongArch/LoongArchLASXInstrInfo.td       |   6 +
 .../Target/LoongArch/LoongArchLSXInstrInfo.td |   5 +
 .../CodeGen/LoongArch/lasx/fp-rounding.ll     | 200 ++----------------
 .../test/CodeGen/LoongArch/lsx/fp-rounding.ll |  88 +-------
 5 files changed, 35 insertions(+), 272 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index a6de839de7c28..904aabed9a843 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -371,6 +371,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
     }
     setOperationAction(ISD::CTPOP, GRLenVT, Legal);
     setOperationAction(ISD::FCEIL, {MVT::f32, MVT::f64}, Legal);
@@ -453,6 +457,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
                          ISD::SETUGE, ISD::SETUGT},
                         VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
     }
   }
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ca4ee5f89573a..610ba052fbdd5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2424,6 +2424,12 @@ def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
           (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>;
 
+// Vector floating-point conversion
+defm : PatXrF<fceil, "XVFRINTRP">;
+defm : PatXrF<ffloor, "XVFRINTRM">;
+defm : PatXrF<ftrunc, "XVFRINTRZ">;
+defm : PatXrF<froundeven, "XVFRINTRNE">;
+
 // load
 def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm),
           (XVLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 92402baa0fa0f..64708421c4ed4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -2552,6 +2552,11 @@ def : Pat<(f64 (froundeven FPR64:$fj)),
           (f64 (EXTRACT_SUBREG (VFRINTRNE_D (VREPLVEI_D
                (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)), sub_64))>;
 
+defm : PatVrF<fceil, "VFRINTRP">;
+defm : PatVrF<ffloor, "VFRINTRM">;
+defm : PatVrF<ftrunc, "VFRINTRZ">;
+defm : PatVrF<froundeven, "VFRINTRNE">;
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
index 79407c3fd4c8b..fa5f27edf615e 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-rounding.ll
@@ -7,38 +7,8 @@ define void @ceil_v8f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: ceil_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrp.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT:    vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT:    vfrintrp.s $vr3, $vr3
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrp.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT:    xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT:    xvst $xr3, $a0, 0
+; CHECK-NEXT:    xvfrintrp.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <8 x float>, ptr %a0
@@ -52,21 +22,7 @@ define void @ceil_v4f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: ceil_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT:    vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrp.d $vr2, $vr2
-; CHECK-NEXT:    vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrp.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT:    xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT:    xvfrintrp.d $xr0, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -81,38 +37,8 @@ define void @floor_v8f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: floor_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrm.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT:    vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT:    vfrintrm.s $vr3, $vr3
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrm.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT:    xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT:    xvst $xr3, $a0, 0
+; CHECK-NEXT:    xvfrintrm.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <8 x float>, ptr %a0
@@ -126,21 +52,7 @@ define void @floor_v4f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: floor_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT:    vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrm.d $vr2, $vr2
-; CHECK-NEXT:    vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrm.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT:    xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT:    xvfrintrm.d $xr0, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -155,38 +67,8 @@ define void @trunc_v8f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: trunc_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrz.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT:    vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT:    vfrintrz.s $vr3, $vr3
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrz.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT:    xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT:    xvst $xr3, $a0, 0
+; CHECK-NEXT:    xvfrintrz.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <8 x float>, ptr %a0
@@ -200,21 +82,7 @@ define void @trunc_v4f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: trunc_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT:    vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrz.d $vr2, $vr2
-; CHECK-NEXT:    vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrz.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT:    xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT:    xvfrintrz.d $xr0, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -229,38 +97,8 @@ define void @roundeven_v8f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: roundeven_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 5
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 4
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrne.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 6
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 7
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 48
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 0
-; CHECK-NEXT:    vreplvei.w $vr3, $vr3, 0
-; CHECK-NEXT:    vfrintrne.s $vr3, $vr3
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 16
-; CHECK-NEXT:    xvpickve.w $xr1, $xr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr3, $vr1, 32
-; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrne.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT:    xvpermi.q $xr3, $xr2, 2
-; CHECK-NEXT:    xvst $xr3, $a0, 0
+; CHECK-NEXT:    xvfrintrne.s $xr0, $xr0
+; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <8 x float>, ptr %a0
@@ -274,21 +112,7 @@ define void @roundeven_v4f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: roundeven_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 3
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 2
-; CHECK-NEXT:    vreplvei.d $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrne.d $vr2, $vr2
-; CHECK-NEXT:    vextrins.d $vr2, $vr1, 16
-; CHECK-NEXT:    xvpickve.d $xr1, $xr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.d $vr1, $vr1
-; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vfrintrne.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
-; CHECK-NEXT:    xvpermi.q $xr0, $xr2, 2
+; CHECK-NEXT:    xvfrintrne.d $xr0, $xr0
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
index 1ca6290a2239b..cb01ac0358ab3 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-rounding.ll
@@ -7,22 +7,8 @@ define void @ceil_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: ceil_v4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrp.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrp.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <4 x float>, ptr %a0
@@ -36,13 +22,7 @@ define void @ceil_v2f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: ceil_v2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrp.d $vr1, $vr1
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrp.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -57,22 +37,8 @@ define void @floor_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: floor_v4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrm.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrm.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <4 x float>, ptr %a0
@@ -86,13 +52,7 @@ define void @floor_v2f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: floor_v2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrm.d $vr1, $vr1
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrm.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -107,22 +67,8 @@ define void @trunc_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: trunc_v4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrz.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrz.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <4 x float>, ptr %a0
@@ -136,13 +82,7 @@ define void @trunc_v2f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: trunc_v2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrz.d $vr1, $vr1
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrz.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -157,22 +97,8 @@ define void @roundeven_v4f32(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: roundeven_v4f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 0
-; CHECK-NEXT:    vreplvei.w $vr2, $vr2, 0
-; CHECK-NEXT:    vfrintrne.s $vr2, $vr2
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 16
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 2
-; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.s $vr1, $vr1
-; CHECK-NEXT:    vextrins.w $vr2, $vr1, 32
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrne.s $vr0, $vr0
-; CHECK-NEXT:    vextrins.w $vr2, $vr0, 48
-; CHECK-NEXT:    vst $vr2, $a0, 0
+; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %v0 = load <4 x float>, ptr %a0
@@ -186,13 +112,7 @@ define void @roundeven_v2f64(ptr %res, ptr %a0) nounwind {
 ; CHECK-LABEL: roundeven_v2f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vld $vr0, $a1, 0
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT:    vfrintrne.d $vr1, $vr1
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
 ; CHECK-NEXT:    vfrintrne.d $vr0, $vr0
-; CHECK-NEXT:    vextrins.d $vr0, $vr1, 16
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:

From a9431327615118afe84ee17444019c0ba99c744a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 31 Oct 2025 20:05:02 -0700
Subject: [PATCH 19/20] [VPlan] Add VPRegionBlock::getCanonicalIVType (NFC).
 (#164127)

Split off from https://github.com/llvm/llvm-project/pull/156262.

Similar to VPRegionBlock::getCanonicalIV, add helper to get the type of
the canonical IV, in preparation for removing VPCanonicalIVPHIRecipe.

PR: https://github.com/llvm/llvm-project/pull/164127
---
 llvm/lib/Transforms/Vectorize/VPlan.h             |  6 ++++++
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp    |  3 +--
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 14 +++++++-------
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp     |  3 +--
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f10058ab4a9a..1504acfcf7e52 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4109,6 +4109,12 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
   const VPCanonicalIVPHIRecipe *getCanonicalIV() const {
     return const_cast<VPRegionBlock *>(this)->getCanonicalIV();
   }
+
+  /// Return the type of the canonical IV for loop regions.
+  Type *getCanonicalIVType() { return getCanonicalIV()->getScalarType(); }
+  const Type *getCanonicalIVType() const {
+    return getCanonicalIV()->getScalarType();
+  }
 };
 
 inline VPRegionBlock *VPRecipeBase::getRegion() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bde62dd6dd4bc..f9c15a31167fa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2372,9 +2372,8 @@ bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
     return false;
   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
-  auto *CanIV = getRegion()->getCanonicalIV();
   return StartC && StartC->isZero() && StepC && StepC->isOne() &&
-         getScalarType() == CanIV->getScalarType();
+         getScalarType() == getRegion()->getCanonicalIVType();
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 986c801abf684..d491d5669ef18 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -820,7 +820,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
   // Calculate the final index.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   auto *CanonicalIV = LoopRegion->getCanonicalIV();
-  Type *CanonicalIVType = CanonicalIV->getScalarType();
+  Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
   VPBuilder B(cast<VPBasicBlock>(PredVPBB));
 
   DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
@@ -2402,8 +2402,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       "index.part.next");
 
   // Create the active lane mask instruction in the VPlan preheader.
-  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
-      ConstantInt::get(TopRegion->getCanonicalIV()->getScalarType(), 1));
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(TopRegion->getCanonicalIVType(), 1));
   auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
                                         {EntryIncrement, TC, ALMMultiplier}, DL,
                                         "active.lane.mask.entry");
@@ -2503,7 +2503,7 @@ void VPlanTransforms::addActiveLaneMask(
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
-        ConstantInt::get(LoopRegion->getCanonicalIV()->getScalarType(), 1));
+        ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
     LaneMask =
         B.createNaryOp(VPInstruction::ActiveLaneMask,
                        {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
@@ -2775,7 +2775,7 @@ void VPlanTransforms::addExplicitVectorLength(
   VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
 
   auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
-  auto *CanIVTy = CanonicalIVPHI->getScalarType();
+  auto *CanIVTy = LoopRegion->getCanonicalIVType();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
@@ -4336,10 +4336,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   VPBuilder PHBuilder(Plan.getVectorPreheader());
 
   VPValue *UF = Plan.getOrAddLiveIn(
-      ConstantInt::get(CanIV->getScalarType(), 1 * Plan.getUF()));
+      ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
   if (VF.isScalable()) {
     VPValue *VScale = PHBuilder.createElementCount(
-        CanIV->getScalarType(), ElementCount::getScalable(1));
+        VectorLoop->getCanonicalIVType(), ElementCount::getScalable(1));
     VPValue *VScaleUF = PHBuilder.createNaryOp(Instruction::Mul, {VScale, UF});
     Inc->setOperand(1, VScaleUF);
     Plan.getVF().replaceAllUsesWith(VScale);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index cfd1a741ee841..f15113c6293bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -69,8 +69,7 @@ class UnrollState {
                                 VPBasicBlock::iterator InsertPtForPhi);
 
   VPValue *getConstantVPV(unsigned Part) {
-    Type *CanIVIntTy =
-        Plan.getVectorLoopRegion()->getCanonicalIV()->getScalarType();
+    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
     return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
   }
 

From caaa39539deb7d219f4338c6006863566488a53a Mon Sep 17 00:00:00 2001
From: Ron Lieberman <ron.lieberman@amd.com>
Date: Fri, 31 Oct 2025 22:15:42 -0500
Subject: [PATCH 20/20] Regen
 llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll

---
 .../alternate-opcode-strict-bitwidth-than-main.ll  | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
index cc2e16e2b099b..959b2350d9d78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-opcode-strict-bitwidth-than-main.ll
@@ -6,14 +6,12 @@ define float @test(i8 %0) {
 ; CHECK-SAME: i8 [[TMP0:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> <i8 poison, i8 0>, i8 [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i16> [[TMP2]], <i16 2, i16 27>
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i16> [[TMP2]], <i16 2, i16 27>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr <2 x i32> [[TMP2]], <i32 2, i32 27>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    switch i32 [[TMP8]], label %[[EXIT:.*]] [
 ; CHECK-NEXT:      i32 0, label %[[EXIT]]