diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp index 621ba684da114..eb396ed7873b4 100644 --- a/src/hotspot/share/opto/compile.cpp +++ b/src/hotspot/share/opto/compile.cpp @@ -73,6 +73,8 @@ #include "opto/opcodes.hpp" #include "opto/output.hpp" #include "opto/parse.hpp" +#include "opto/phase.hpp" +#include "opto/phaseloadfolding.hpp" #include "opto/phaseX.hpp" #include "opto/rootnode.hpp" #include "opto/runtime.hpp" @@ -2402,6 +2404,19 @@ void Compile::Optimize() { if (failing()) return; + { + // This phase is much faster than EA, so doing it before EA reduces the work of EA by reducing + // the number of loads. It also helps EA terminate sooner because folded loads may expose + // further EA opportunities, and it is better if an EA opportunity is revealed from the + // beginning than if it is only revealed after some rounds of EA. + TracePhase tp(_t_loadFolding); + PhaseLoadFolding load_folding(igvn); + load_folding.optimize(); + if (failing()) { + return; + } + } + if (has_loops()) { print_method(PHASE_BEFORE_LOOP_OPTS, 2); } diff --git a/src/hotspot/share/opto/phase.cpp b/src/hotspot/share/opto/phase.cpp index 5603033ce69d8..a8dc270392d92 100644 --- a/src/hotspot/share/opto/phase.cpp +++ b/src/hotspot/share/opto/phase.cpp @@ -68,6 +68,7 @@ void Phase::print_timers() { tty->print_cr (" Conn Graph: %7.3f s", timers[_t_connectionGraph].seconds()); tty->print_cr (" Macro Eliminate: %7.3f s", timers[_t_macroEliminate].seconds()); } + tty->print_cr (" Load Folding: %7.3f s", timers[_t_loadFolding].seconds()); tty->print_cr (" GVN 1: %7.3f s", timers[_t_iterGVN].seconds()); { @@ -100,6 +101,7 @@ void Phase::print_timers() { double other = timers[_t_optimizer].seconds() - (timers[_t_escapeAnalysis].seconds() + + timers[_t_loadFolding].seconds() + timers[_t_iterGVN].seconds() + timers[_t_incrInline].seconds() + timers[_t_vector].seconds() + diff --git a/src/hotspot/share/opto/phase.hpp b/src/hotspot/share/opto/phase.hpp index 6700df6ec177e..194b5a5c18027 100644 --- a/src/hotspot/share/opto/phase.hpp +++ b/src/hotspot/share/opto/phase.hpp @@ -50,6 +50,7 @@ class Phase : public StackObj { Remove_Useless_And_Renumber_Live, // First, remove useless nodes from the graph. Then, renumber live nodes. Optimistic, // Optimistic analysis phase GVN, // Pessimistic global value numbering phase + LoadFolding, // Aggressively look through loads Ins_Select, // Instruction selection phase CFG, // Build a CFG BlockLayout, // Linear ordering of blocks @@ -73,6 +74,7 @@ class Phase : public StackObj { f( _t_escapeAnalysis, "escapeAnalysis") \ f( _t_connectionGraph, "connectionGraph") \ f( _t_macroEliminate, "macroEliminate") \ + f( _t_loadFolding, "loadFolding") \ f( _t_iterGVN, "iterGVN") \ f( _t_incrInline, "incrementalInline") \ f( _t_incrInline_ideal, "incrementalInline_ideal") \ diff --git a/src/hotspot/share/opto/phaseloadfolding.cpp b/src/hotspot/share/opto/phaseloadfolding.cpp new file mode 100644 index 0000000000000..ae3b28268e194 --- /dev/null +++ b/src/hotspot/share/opto/phaseloadfolding.cpp @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "libadt/vectset.hpp"
+#include "memory/resourceArea.hpp"
+#include "opto/addnode.hpp"
+#include "opto/callnode.hpp"
+#include "opto/cfgnode.hpp"
+#include "opto/compile.hpp"
+#include "opto/memnode.hpp"
+#include "opto/mulnode.hpp"
+#include "opto/node.hpp"
+#include "opto/phaseloadfolding.hpp"
+#include "utilities/globalDefinitions.hpp"
+
+void PhaseLoadFolding::optimize() {
+  ciEnv* env = C->env();
+  if (env->should_retain_local_variables() || env->jvmti_can_walk_any_space()) {
+    // Give up because JVMTI can do wonders
+    return;
+  }
+
+  // This phase is very fast, but it is still preferable not to allow a potentially unbounded
+  // number of iterations
+  for (int i = 0; i < 10; i++) {
+    bool progress = do_optimize();
+    if (!progress) {
+      return;
+    }
+    _igvn.optimize();
+    if (C->failing()) {
+      return;
+    }
+  }
+}
+
+// The escape status of a node is visible in the memory graph. That is, at runtime, if a load 'l'
+// from an object 'o' must be executed after an action 'a' that allows 'o' to escape, and in the
+// IR graph, the node 'L' corresponding to 'l' consumes the address 'O' + c, with 'O' being the
+// node corresponding to the newly allocated object 'o', then there must be a path along the
+// use-def edges from 'L' to the node 'A' that corresponds to 'a'.
+//
+// - If 'a' is a method invocation that receives 'o' as an argument, then in the graph, 'A' kills
+// all memory. As a result, the memory input of 'L' must be a transitive use of 'A'. This is
+// because in a well-behaved memory graph, there is always a path of use-def edges from a memory
+// node to the previous bottom memory node. This is important as it ensures memory fences can
+// serialize memory operations by imposing use-def dependencies between the fence and the
+// surrounding memory nodes.
+// Example:
+//   Integer o = new Integer(v);
+//   int x = o.value;
+//   if (flag) {
+//     consume(o);
+//     int y = o.value;
+//   } else {
+//     consume(null);
+//     int z = o.value;
+//   }
+//   int t = o.value;
+// The memory graph will then look like:
+//     NarrowMemProj (Integer.value)
+//         |             |
+//         |             |
+//   CallJava(o)    CallJava(null)
+//         |             |
+//         |             |
+//       Proj1         Proj2
+//           \         /
+//            \       /
+//              Phi
+// We can see that the object can be considered non-escaped at NarrowMemProj, CallJava(null), and
+// Proj2, while it is considered escaped at CallJava(o), Proj1, and Phi. The loads x and z will be
+// from NarrowMemProj and Proj2, respectively, which means they can be considered loads from an
+// object that has not escaped, and we can fold them to v. On the other hand, the loads y and t
+// are from Proj1 and Phi, respectively, which means we cannot assume that the only value they
+// can see is v.
+//
+// - If 'a' is a store of 'o' into the memory, then 'l' must be executed after 'a' iff:
+//   + There is a memory fence that prevents 'l' from being executed before 'a'. Since a memory
+//     fence kills all memory, the node 'F' corresponding to that fence must be a transitive use
+//     of 'A', and the memory input of 'L' must be a transitive use of 'F', similar to case 1.
+//   + There is a data dependency between 'l' and 'a'. In this case, there must be a path of
+//     use-def edges from 'L' to 'A'.
+// For example:
+//   Integer o = new Integer(v);
+//   *p = o;
+//   Integer o_clone = *p;
+//   o_clone.value = u;
+//   int x = o.value;
+// Then, there is a path of use-def edges:
+//   Load(x = o.value)
+//     | (MemNode::Memory)
+//     v
+//   Store(o_clone.value = u)
+//     | (MemNode::Address)
+//     v
+//   Load(o_clone = *p)
+//     | (MemNode::Memory)
+//     v
+//   Store(*p = o)
+// We can see that we cannot fold x to v, because it must observe the value u, and we can
+// correctly detect that the object o has escaped by following the outputs of the store that
+// allows o to escape.
+//
+// It is important to note that even if 'l' is scheduled after the store 'a', unless there is a
+// memory fence between 'l' and 'a', it is generally not required that 'l' is executed after 'a'.
+// For example:
+// 1. Integer o = new Integer(v);
+//    *p = o;
+//    int x = o.value;
+//    In this case, even if the load x = o.value is declared after the store of o to p that
+//    allows o to escape, it is valid for the load to actually happen before the store. If the
+//    developer wants to ensure that the order in which the memory accesses appear in the program
+//    is the same as the order in which they are executed, memory barriers (e.g. a store-load
+//    barrier) must be placed between them. As a result, we can consider x = o.value to be a load
+//    from an object that has not escaped, and fold it to v.
+// 2. boolean b1, b2;
+//    Point o = new Point(v1, v2);
+//    int r;
+//    if (b1) {
+//      *p = o;
+//    } else {
+//      *q = o;
+//    }
+//    if (b2) {
+//      r = o.x;
+//    } else {
+//      r = o.y;
+//    }
+//    In this case, even if the control flow forces the loads to be scheduled after the stores
+//    that allow o to escape, without actual memory barriers, the JMM does not require the CPU to
+//    execute the loads after the stores (e.g. the loads are in cache so they can be executed
+//    sooner while the stores need to wait for the acquisition of the corresponding cache lines).
+//    As a result, we can consider those loads to be from an object that has not escaped, and fold
+//    o.x to v1 and o.y to v2.
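+//
+// Each pass of do_optimize() below works in three steps: collect the loads from each tracked
+// allocation whose address has the form oop + constant, discard those that may have observed an
+// escape of oop, and walk the memory graph of each remaining load to find the store it must
+// observe. As a rough illustration (using the Point class from the accompanying
+// TestLoadFolding.java, whose constructor stores x = 1 and y = 2):
+//   Point p = new Point();
+//   escape(null);      // a call that kills all memory, but p itself does not escape
+//   int r = p.x + p.y;
+// Both loads can be folded to the values captured by the constructor stores, so r becomes 1 + 2,
+// even though the memory inputs of the loads are pinned after the call.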
+bool PhaseLoadFolding::do_optimize() {
+  bool progress = false;
+  for (int macro_idx = 0; macro_idx < C->macro_count(); macro_idx++) {
+    Node* macro = C->macro_node(macro_idx);
+    if (!macro->is_Allocate()) {
+      continue;
+    }
+
+    AllocateNode* alloc = macro->as_Allocate();
+    Node* oop = alloc->result_cast();
+    if (oop == nullptr) {
+      continue;
+    }
+
+    if (process_allocate_result(oop)) {
+      progress = true;
+    }
+  }
+  return progress;
+}
+
+// Find all loads from oop that have not observed the escape of oop, and try to find their
+// corresponding stores
+bool PhaseLoadFolding::process_allocate_result(Node* oop) {
+  ResourceMark rm;
+  Unique_Node_List candidates;
+  VectorSet candidate_set;
+
+  collect_loads(candidates, candidate_set, oop);
+  if (candidate_set.is_empty()) {
+    return false;
+  }
+
+  WorkLists work_lists;
+  process_candidates(candidate_set, work_lists, oop);
+  if (candidate_set.is_empty()) {
+    return false;
+  }
+
+  bool progress = false;
+  for (uint candidate_idx = 0; candidate_idx < candidates.size(); candidate_idx++) {
+    LoadNode* candidate = candidates.at(candidate_idx)->as_Load();
+    if (!candidate_set.test(candidate->_idx)) {
+      continue;
+    }
+
+    work_lists.results.clear();
+    Node* folded_value = try_fold_recursive(oop, candidate, candidate->in(MemNode::Memory), work_lists);
+    if (folded_value != nullptr) {
+      progress = true;
+      _igvn.replace_node(candidate, folded_value);
+    }
+  }
+  return progress;
+}
+
+// Collect all loads from oop
+void PhaseLoadFolding::collect_loads(Unique_Node_List& candidates, VectorSet& candidate_set, Node* oop) {
+  assert(candidates.size() == 0 && candidate_set.is_empty(), "must start with no candidates");
+  for (DUIterator_Fast oop_out_max, oop_out_idx = oop->fast_outs(oop_out_max); oop_out_idx < oop_out_max; oop_out_idx++) {
+    Node* out = oop->fast_out(oop_out_idx);
+    if (!out->is_AddP()) {
+      continue;
+    }
+
+    if (out->in(AddPNode::Base) != oop || out->in(AddPNode::Address) != oop || !out->in(AddPNode::Offset)->is_Con()) {
+      // Only try to fold loads in the form of oop + C
+      continue;
+    }
+
+    for (DUIterator_Fast addp_out_max, addp_out_idx = out->fast_outs(addp_out_max); addp_out_idx < addp_out_max; addp_out_idx++) {
+      Node* addp_out = out->fast_out(addp_out_idx);
+      if (addp_out->is_Load() && !addp_out->as_Load()->is_mismatched_access()) {
+        candidates.push(addp_out);
+      }
+    }
+  }
+
+  for (uint i = 0; i < candidates.size(); i++) {
+    candidate_set.set(candidates.at(i)->_idx);
+  }
+}
+
+// Find all nodes that observe the escape of oop. This function also finds stores that may store
+// into oop. This is tricky, for example:
+//   Integer o = new Integer(v);
+//   Integer phi = o;
+//   if (b) {
+//     phi = new Integer(0);
+//   }
+//   phi.value = 1;
+// Then, the store phi.value = 1 may or may not modify o; this cannot be known at compile time. As
+// a result, when we walk the memory graph from a load, if we encounter such a store, we cannot
+// know whether it stores the value we are looking for, and must give up.
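+// The same applies to a store at a variable index into a newly allocated array, e.g.
+//   int[] a = new int[2];
+//   a[idx & 1] = 3;
+// where the store address is not of the form oop + constant, so any candidate load that reaches
+// this store while walking the memory graph has to be given up on (see test05 and test18 in
+// TestLoadFolding.java).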
+void PhaseLoadFolding::process_candidates(VectorSet& candidate_set, WorkLists& work_lists, Node* oop) {
+  assert(work_lists.may_alias.is_empty() && work_lists.escapes.size() == 0 && work_lists.work_list.size() == 0, "must start with empty work lists");
+  work_lists.work_list.push(oop);
+  for (uint wl_idx = 0; wl_idx < work_lists.work_list.size(); wl_idx++) {
+    // At runtime, n may be the same as oop, or may be a different value
+    Node* n = work_lists.work_list.at(wl_idx);
+    for (DUIterator_Fast out_max, out_idx = n->fast_outs(out_max); out_idx < out_max; out_idx++) {
+      Node* out = n->fast_out(out_idx);
+      if (out->is_ConstraintCast() || out->is_DecodeN() || out->is_EncodeP() ||
+          out->is_Phi() || out->is_CMove()) {
+        // All things that can alias n
+        work_lists.work_list.push(out);
+      } else if (out->is_AddP()) {
+        AddPNode* addp = out->as_AddP();
+        assert(addp->base_node() == n, "unexpected base of an AddP");
+
+        // A store that may or may not modify a field of oop (e.g. a store into a Phi which has oop
+        // as one input, or a store into an element of oop at a variable index). This is
+        // conservative, that is, it must be true whenever the store may modify a field of oop but
+        // is not in the form oop + C
+        bool may_alias = false;
+        if (out->in(AddPNode::Base) != oop || out->in(AddPNode::Address) != oop || !out->in(AddPNode::Offset)->is_Con()) {
+          // Not an oop + C pointer
+          may_alias = true;
+        }
+
+        for (DUIterator_Fast addp_out_max, addp_out_idx = addp->fast_outs(addp_out_max); addp_out_idx < addp_out_max; addp_out_idx++) {
+          Node* addp_out = addp->fast_out(addp_out_idx);
+          if ((addp_out->is_Store() || addp_out->is_LoadStore())) {
+            assert(addp == addp_out->in(MemNode::Address), "store a derived pointer?");
+            if (may_alias) {
+              work_lists.may_alias.set(addp_out->_idx);
+            }
+
+            if (addp_out->is_LoadStore() || addp_out->as_Store()->is_mismatched_access()) {
+              // Mismatched accesses are especially hard because they may lie in a different alias
+              // class, so we may not encounter them when walking the memory graph. As a result, be
+              // conservative and give up on all loads that may observe this store. LoadStores are
+              // also lumped here because there is no LoadStoreNode::is_mismatched_access.
+              work_lists.escapes.push(addp_out);
+            }
+          } else if (addp_out->is_Mem()) {
+            // A load, which does not affect the memory
+          } else if (addp_out->is_AddP()) {
+            // Another AddP; it should share the base with the current addp, so it will be visited
+            // later
+            assert(addp_out->in(AddPNode::Base) == n, "must have the same base");
+          } else {
+            // Some runtime calls receive the pointer without the base
+            work_lists.escapes.push(addp_out);
+          }
+        }
+      } else if (out->is_Mem()) {
+        // A store that may allow oop to escape
+        if (out->req() > MemNode::ValueIn && n == out->in(MemNode::ValueIn)) {
+          work_lists.escapes.push(out);
+        }
+      } else if (out->is_Call()) {
+        // A call that may allow oop to escape
+        if (!out->is_AbstractLock() && out->as_Call()->has_non_debug_use(n)) {
+          work_lists.escapes.push(out);
+        }
+      } else if (out->is_SafePoint()) {
+        // Non-call safepoints are pure control nodes
+        continue;
+      } else {
+        // Be conservative with everything else
+        work_lists.escapes.push(out);
+      }
+    }
+  }
+
+  // Propagate the escape status: if a node observes oop escaping, then all of its users also
+  // observe that oop escapes
+  for (uint idx = 0; idx < work_lists.escapes.size(); idx++) {
+    Node* n = work_lists.escapes.at(idx);
+    candidate_set.remove(n->_idx);
+    if (candidate_set.is_empty()) {
+      return;
+    }
+
+    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+      Node* out = n->fast_out(i);
+      if (!out->is_Root()) {
+        work_lists.escapes.push(out);
+      }
+    }
+  }
+}
+
+// Try to find the store that a load observes. Since we know that oop has not escaped, we can
+// inspect the graph aggressively, ignoring calls and memory barriers.
+Node* PhaseLoadFolding::try_fold_recursive(Node* oop, LoadNode* candidate, Node* mem, WorkLists& work_lists) {
+  // An arbitrary int can be the input to a StoreB or a StoreC, so the load needs to redo the
+  // normalization
+  auto extract_store_value = [&](StoreNode* store) {
+    assert(store->Opcode() == candidate->store_Opcode(), "must match %s - %s", store->Name(), candidate->Name());
+    Node* res = store->in(MemNode::ValueIn);
+    if (candidate->Opcode() == Op_LoadUB) {
+      res = new AndINode(res, _igvn.intcon(0xFF));
+      _igvn.register_new_node_with_optimizer(res);
+    } else if (candidate->Opcode() == Op_LoadB) {
+      res = new LShiftINode(res, _igvn.intcon(24));
+      _igvn.register_new_node_with_optimizer(res);
+      res = new RShiftINode(res, _igvn.intcon(24));
+      _igvn.register_new_node_with_optimizer(res);
+    } else if (candidate->Opcode() == Op_LoadUS) {
+      res = new AndINode(res, _igvn.intcon(0xFFFF));
+      _igvn.register_new_node_with_optimizer(res);
+    } else if (candidate->Opcode() == Op_LoadS) {
+      res = new LShiftINode(res, _igvn.intcon(16));
+      _igvn.register_new_node_with_optimizer(res);
+      res = new RShiftINode(res, _igvn.intcon(16));
+      _igvn.register_new_node_with_optimizer(res);
+    }
+    return res;
+  };
+
+  Node* ptr = candidate->in(MemNode::Address);
+  int alias_idx = C->get_alias_index(_igvn.type(ptr)->is_ptr());
+  while (true) {
+    // We may encounter a memory loop, so recording Phis is necessary
+    if (work_lists.results.length() > int(mem->_idx)) {
+      Node* res = work_lists.results.at(mem->_idx);
+      if (res != nullptr) {
+        return res;
+      }
+    }
+
+    // If we encounter a store for which we cannot decide whether it modifies the memory the
+    // candidate loads from, give up
+    if (work_lists.may_alias.test(mem->_idx)) {
+      return nullptr;
+    }
+
+    if (mem->is_MergeMem()) {
+      mem = mem->as_MergeMem()->memory_at(alias_idx);
+    } else if (mem->is_Phi()) {
+      // Create a Phi for the result and store it in
+      // work_lists.results; this allows working with cycles
+      PhiNode* res = new PhiNode(mem->in(0), candidate->bottom_type());
+      _igvn.register_new_node_with_optimizer(res);
+      work_lists.results.at_put_grow(mem->_idx, res);
+      for (uint i = 1; i < mem->req(); i++) {
+        Node* phi_in = try_fold_recursive(oop, candidate, mem->in(i), work_lists);
+        if (phi_in == nullptr) {
+          return nullptr;
+        }
+
+        res->init_req(i, phi_in);
+      }
+      return res;
+    } else if (mem->is_Proj()) {
+      mem = mem->in(0);
+    } else if (mem->is_MemBar()) {
+      // Look through MemBars; only stop at the InitializeNode of oop
+      if (!mem->is_Initialize() || mem != oop->in(0)->in(0)) {
+        mem = mem->in(TypeFunc::Memory);
+        continue;
+      }
+
+      InitializeNode* init = mem->as_Initialize();
+      assert(ptr->is_AddP() && ptr->in(AddPNode::Base) == oop && ptr->in(AddPNode::Address) == oop && ptr->in(AddPNode::Offset)->is_Con(),
+             "invalid pointer");
+
+#ifdef _LP64
+      Node* res = init->find_captured_store(ptr->in(AddPNode::Offset)->get_long(), candidate->memory_size(), &_igvn);
+#else // _LP64
+      Node* res = init->find_captured_store(ptr->in(AddPNode::Offset)->get_int(), candidate->memory_size(), &_igvn);
+#endif // _LP64
+      if (res == nullptr) {
+        return nullptr;
+      } else if (res->is_Proj() && res->in(0) == init->allocation()) {
+        // If no captured store is found, find_captured_store returns the memory output of the
+        // AllocateNode, so the field still has its default zero value
+        return _igvn.zerocon(candidate->value_basic_type());
+      } else if (res->Opcode() == candidate->store_Opcode()) {
+        return extract_store_value(res->as_Store());
+      } else {
+        return nullptr;
+      }
+    } else if (mem->is_SafePoint()) {
+      mem = mem->in(TypeFunc::Memory);
+    } else if (mem->is_Store()) {
+      // We discarded all stores that may write into this field but do not have the form oop + C,
+      // so a simple comparison of the address input is enough
+      if (ptr == mem->in(MemNode::Address)) {
+        return extract_store_value(mem->as_Store());
+      } else {
+        mem = mem->in(MemNode::Memory);
+      }
+    } else {
+      return nullptr;
+    }
+  }
+}
diff --git a/src/hotspot/share/opto/phaseloadfolding.hpp b/src/hotspot/share/opto/phaseloadfolding.hpp
new file mode 100644
index 0000000000000..2f1c920a3c749
--- /dev/null
+++ b/src/hotspot/share/opto/phaseloadfolding.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_OPTO_PHASELOADFOLDING_HPP
+#define SHARE_OPTO_PHASELOADFOLDING_HPP
+
+#include "libadt/vectset.hpp"
+#include "opto/node.hpp"
+#include "opto/phase.hpp"
+#include "utilities/growableArray.hpp"
+
+class AllocateNode;
+class PhaseIterGVN;
+
+// Try to fold loads by finding the corresponding stores. The transformations here inspect the
+// graph more aggressively than during IterGVN, so it is a separate phase in the compilation
+// process. The loads taken into consideration are:
+//
+// 1. Loads from an object that has not escaped: all modifications must be visible in the graph.
+//    As a result, we can follow the memory input, skipping through calls and memory fences, to
+//    find the corresponding store.
+class PhaseLoadFolding : public Phase {
+private:
+  PhaseIterGVN& _igvn;
+
+  class WorkLists {
+  public:
+    VectorSet may_alias;
+    Unique_Node_List escapes;
+    Unique_Node_List work_list;
+    GrowableArray<Node*> results;
+  };
+
+  bool do_optimize();
+  bool process_allocate_result(Node* oop);
+  void collect_loads(Unique_Node_List& candidates, VectorSet& candidate_mems, Node* oop);
+  void process_candidates(VectorSet& candidate_mems, WorkLists& work_lists, Node* oop);
+  Node* try_fold_recursive(Node* oop, LoadNode* candidate, Node* mem, WorkLists& work_lists);
+
+public:
+  PhaseLoadFolding(PhaseIterGVN& igvn) : Phase(LoadFolding), _igvn(igvn) {}
+  void optimize();
+};
+
+#endif // SHARE_OPTO_PHASELOADFOLDING_HPP
diff --git a/test/hotspot/jtreg/compiler/escapeAnalysis/TestLoadFolding.java b/test/hotspot/jtreg/compiler/escapeAnalysis/TestLoadFolding.java
new file mode 100644
index 0000000000000..7e6afaa7ab76c
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/escapeAnalysis/TestLoadFolding.java
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */ + +package compiler.escapeAnalysis; + +import compiler.lib.ir_framework.*; + +import java.lang.invoke.VarHandle; + +/** + * @test + * @bug 8373495 + * @summary Test that loads from a newly allocated object are aggressively folded if the object has not escaped + * @library /test/lib / + * @run driver ${test.main.class} + */ +public class TestLoadFolding { + public static class Point { + int x; + int y; + + Point() { + x = 1; + y = 2; + } + + static final Point DEFAULT = new Point(); + } + + static Point staticField; + + public static void main(String[] args) { + TestFramework.run(); + } + + @Run(test = {"test11", "test12", "test13", "test14", "test15", "test16", "test17", "test18"}) + public void runPositiveTests() { + test11(); + test12(false); + test12(true); + test13(false); + test13(true); + test14(); + test15(1, 16); + test16(1, 16, false); + test16(1, 16, true); + test17(0); + test18(0); + } + + @Run(test = {"test01", "test02", "test03", "test04", "test05"}) + public void runNegativeTests() { + test01(); + test02(false); + test02(true); + test03(false); + test03(true); + test04(1, 16); + test05(0); + } + + @DontInline + static void escape(Object o) {} + + @Test + @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"}) + public Point test11() { + // p only escapes at return + Point p = new Point(); + escape(null); + p.x += p.y; + return p; + } + + @Test + @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"}) + public Point test12(boolean b) { + // p escapes in another branch + Point p = new Point(); + if (b) { + escape(p); + } else { + escape(null); + p.x += p.y; + } + return p; + } + + @Test + @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"}) + public Point test13(boolean b) { + // A Phi of p1 and Point.DEFAULT, but a store to Phi is after all the loads from p1 + Point p1 = new Point(); + Point p = b ? 
p1 : Point.DEFAULT;
+        escape(null);
+        p.x = p1.x + p1.y;
+        return p;
+    }
+
+    @Test
+    @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"})
+    public int test14() {
+        // Even if p escapes before the loads, if it is legal to execute the loads before the
+        // store, then we can fold the loads
+        Point p = new Point();
+        escape(null);
+        staticField = p;
+        return p.x + p.y;
+    }
+
+    @Test
+    @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"})
+    public Point test15(int begin, int end) {
+        // Fold the load that is a part of a cycle
+        Point p = new Point();
+        for (int i = begin; i < end; i *= 2) {
+            p.x++;
+            escape(null); // Force a memory Phi
+        }
+        p.x += p.y;
+        return p;
+    }
+
+    @Test
+    @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC, "1"})
+    public Point test16(int begin, int end, boolean b) {
+        // A cycle and a Phi, this time the store is to a different object
+        Point p1 = new Point();
+        // This store is not on a Phi involving p1, so it does not interfere
+        Point.DEFAULT.y = 3;
+        Point p = p1;
+        for (int i = begin; i < end; i += 2) {
+            if (b) {
+                p = p1;
+            } else {
+                p = Point.DEFAULT;
+            }
+            b = !b;
+
+            p.x = p1.y + 3;
+            escape(null); // Force a memory Phi
+        }
+        p1.x = p1.y;
+        return p;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "1", IRNode.ALLOC_ARRAY, "1"})
+    public int test17(int idx) {
+        // Array
+        int[] a = new int[2];
+        a[0] = 1;
+        a[1] = 2;
+        int res = a[idx & 1];
+        escape(null);
+        res += a[0] + a[1];
+        escape(a);
+        return res;
+    }
+
+    @Test
+    @IR(failOn = IRNode.LOAD_I, counts = {IRNode.ALLOC_ARRAY, "1"})
+    public int test18(int idx) {
+        // Array: although we would give up if we reached a[idx & 1] = 3, we meet a[0] = 4 first,
+        // so the load int res = a[0] can still be folded
+        int[] a = new int[2];
+        a[0] = 1;
+        a[1] = 2;
+        escape(null);
+        a[idx & 1] = 3;
+        a[0] = 4;
+        escape(null);
+        int res = a[0];
+        escape(a);
+        return res;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "2", IRNode.ALLOC, "1"})
+    public int test01() {
+        Point p = new Point();
+        staticField = p;
+        // Only a fullFence requires the following loads to be executed after the store, so they
+        // must not be folded
+        VarHandle.fullFence();
+        return p.x + p.y;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "1", IRNode.ALLOC, "1"})
+    public int test02(boolean b) {
+        Point p = new Point();
+        if (b) {
+            escape(p);
+            // p escaped, so the load must not be removed
+            return p.x;
+        } else {
+            escape(null);
+            return 0;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "1", IRNode.ALLOC, "1"})
+    public int test03(boolean b) {
+        Point p = new Point();
+        if (b) {
+            escape(p);
+        }
+        // p escaped, so the load must not be removed
+        return p.x;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "> 0", IRNode.ALLOC, "1"})
+    public Point test04(int begin, int end) {
+        Point p = new Point();
+        for (int i = begin; i < end; i *= 2) {
+            // p has escaped here because this is a loop: it escapes in a previous iteration
+            p.x++;
+            escape(p);
+        }
+        return p;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_I, "2", IRNode.ALLOC_ARRAY, "1"})
+    public int test05(int idx) {
+        int[] a = new int[2];
+        a[0] = 1;
+        a[1] = 2;
+        escape(null);
+        a[idx & 1] = 3;
+        // Cannot fold the loads because we do not know which element is written to by
+        // a[idx & 1] = 3
+        return a[0] + a[1];
+    }
+}
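The shift and mask nodes created by extract_store_value() in phaseloadfolding.cpp correspond to the following Java-level identities. This is a minimal, self-contained sketch (the class name is made up for illustration and is not part of the patch) of why a captured StoreB/StoreC value, which can be an arbitrary int, has to be re-normalized before it can replace a LoadB/LoadUB/LoadS/LoadUS:

    // Illustrative only: the normalization applied when folding sub-word loads.
    public class SubWordNormalizationSketch {
        public static void main(String[] args) {
            int v = 0x1234F6FF;             // arbitrary int that a StoreB/StoreC may capture
            int asLoadB  = (v << 24) >> 24; // LoadB:  sign-extend the low byte   -> -1
            int asLoadUB = v & 0xFF;        // LoadUB: zero-extend the low byte   -> 255
            int asLoadS  = (v << 16) >> 16; // LoadS:  sign-extend the low short  -> -2305
            int asLoadUS = v & 0xFFFF;      // LoadUS: zero-extend the low short  -> 63231
            System.out.println(asLoadB + " " + asLoadUB + " " + asLoadS + " " + asLoadUS);
        }
    }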