diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index c385e82792113..9a38973db122a 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -51,11 +51,11 @@ EXTERN_C NATIVEAOT_API void __cdecl RhSpinWait(int32_t iterations) ASSERT(iterations > 0); // limit the spin count in coop mode. - ASSERT_MSG(iterations <= 10000 || !ThreadStore::GetCurrentThread()->IsCurrentThreadInCooperativeMode(), + ASSERT_MSG(iterations <= 1024 || !ThreadStore::GetCurrentThread()->IsCurrentThreadInCooperativeMode(), "This is too long wait for coop mode. You must p/invoke with GC transition."); YieldProcessorNormalizationInfo normalizationInfo; - YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, iterations); + YieldProcessorNormalized(normalizationInfo, iterations); } // Yield the cpu to another thread ready to process, if one is available. diff --git a/src/coreclr/nativeaot/Runtime/thread.cpp b/src/coreclr/nativeaot/Runtime/thread.cpp index fd70a59aef864..f55a96d9e9d55 100644 --- a/src/coreclr/nativeaot/Runtime/thread.cpp +++ b/src/coreclr/nativeaot/Runtime/thread.cpp @@ -260,7 +260,10 @@ void Thread::Construct() // alloc_context ever needs different initialization, a matching change to the tls_CurrentThread // static initialization will need to be made. - m_uPalThreadIdForLogging = PalGetCurrentThreadIdForLogging(); + m_pTransitionFrame = TOP_OF_STACK_MARKER; + m_pDeferredTransitionFrame = TOP_OF_STACK_MARKER; + m_hPalThread = INVALID_HANDLE_VALUE; + m_threadId.SetToCurrentThread(); HANDLE curProcessPseudo = PalGetCurrentProcess(); @@ -328,7 +331,7 @@ bool Thread::CatchAtSafePoint() uint64_t Thread::GetPalThreadIdForLogging() { - return m_uPalThreadIdForLogging; + return *(uint64_t*)&m_threadId; } bool Thread::IsCurrentThread() diff --git a/src/coreclr/nativeaot/Runtime/thread.h b/src/coreclr/nativeaot/Runtime/thread.h index a83cc952199a4..138f8e998e49a 100644 --- a/src/coreclr/nativeaot/Runtime/thread.h +++ b/src/coreclr/nativeaot/Runtime/thread.h @@ -95,8 +95,7 @@ struct ThreadBuffer PTR_VOID m_pStackLow; PTR_VOID m_pStackHigh; PTR_UInt8 m_pTEB; // Pointer to OS TEB structure for this thread - uint64_t m_uPalThreadIdForLogging; // @TODO: likely debug-only - EEThreadId m_threadId; + EEThreadId m_threadId; // OS thread ID PTR_VOID m_pThreadStressLog; // pointer to head of thread's StressLogChunks NATIVE_CONTEXT* m_interruptedContext; // context for an asynchronously interrupted thread. 
#ifdef FEATURE_SUSPEND_REDIRECTION @@ -192,7 +191,7 @@ class Thread : private ThreadBuffer gc_alloc_context * GetAllocContext(); // @TODO: I would prefer to not expose this in this way #ifndef DACCESS_COMPILE - uint64_t GetPalThreadIdForLogging(); + uint64_t GetPalThreadIdForLogging(); bool IsCurrentThread(); void GcScanRoots(void * pfnEnumCallback, void * pvCallbackData); diff --git a/src/coreclr/nativeaot/Runtime/threadstore.cpp b/src/coreclr/nativeaot/Runtime/threadstore.cpp index 5b3aaf9b1e29d..edb374ac2442e 100644 --- a/src/coreclr/nativeaot/Runtime/threadstore.cpp +++ b/src/coreclr/nativeaot/Runtime/threadstore.cpp @@ -415,20 +415,9 @@ COOP_PINVOKE_HELPER(void, RhpCancelThreadAbort, (void* thread)) C_ASSERT(sizeof(Thread) == sizeof(ThreadBuffer)); -EXTERN_C DECLSPEC_THREAD ThreadBuffer tls_CurrentThread; -DECLSPEC_THREAD ThreadBuffer tls_CurrentThread = -{ - { 0 }, // m_rgbAllocContextBuffer - Thread::TSF_Unknown, // m_ThreadStateFlags - TOP_OF_STACK_MARKER, // m_pTransitionFrame - TOP_OF_STACK_MARKER, // m_pDeferredTransitionFrame - 0, // m_pCachedTransitionFrame - 0, // m_pNext - INVALID_HANDLE_VALUE, // m_hPalThread - 0, // m_ppvHijackedReturnAddressLocation - 0, // m_pvHijackedReturnAddress - 0, // all other fields are initialized by zeroes -}; +#ifndef _MSC_VER +__thread ThreadBuffer tls_CurrentThread; +#endif EXTERN_C ThreadBuffer* RhpGetThread() { diff --git a/src/coreclr/nativeaot/Runtime/threadstore.inl b/src/coreclr/nativeaot/Runtime/threadstore.inl index aa450b72fb448..29495046a9827 100644 --- a/src/coreclr/nativeaot/Runtime/threadstore.inl +++ b/src/coreclr/nativeaot/Runtime/threadstore.inl @@ -1,7 +1,12 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -EXTERN_C DECLSPEC_THREAD ThreadBuffer tls_CurrentThread; +#ifdef _MSC_VER +// a workaround to prevent tls_CurrentThread from becoming dynamically checked/initialized. +EXTERN_C __declspec(selectany) __declspec(thread) ThreadBuffer tls_CurrentThread; +#else +EXTERN_C __thread ThreadBuffer tls_CurrentThread; +#endif // static inline Thread * ThreadStore::RawGetCurrentThread() diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs index b19f4622deaf6..2beff5cecf7e4 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/Internal/Runtime/CompilerHelpers/SynchronizedMethodHelpers.cs @@ -14,20 +14,33 @@ internal static class SynchronizedMethodHelpers private static void MonitorEnter(object obj, ref bool lockTaken) { // Inlined Monitor.Enter with a few tweaks - Lock lck = Monitor.GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + { + lockTaken = true; + return; + } + + Lock lck = resultOrIndex == 0 ? 
+ ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) { lockTaken = true; return; } + Monitor.TryAcquireContended(lck, obj, Timeout.Infinite); lockTaken = true; } private static void MonitorExit(object obj, ref bool lockTaken) { // Inlined Monitor.Exit with a few tweaks - if (!lockTaken) return; - Monitor.GetLock(obj).Release(); + if (!lockTaken) + return; + + ObjectHeader.Release(obj); lockTaken = false; } @@ -35,21 +48,34 @@ private static void MonitorEnterStatic(IntPtr pEEType, ref bool lockTaken) { // Inlined Monitor.Enter with a few tweaks object obj = GetStaticLockObject(pEEType); - Lock lck = Monitor.GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + { + lockTaken = true; + return; + } + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) { lockTaken = true; return; } + Monitor.TryAcquireContended(lck, obj, Timeout.Infinite); lockTaken = true; } private static void MonitorExitStatic(IntPtr pEEType, ref bool lockTaken) { // Inlined Monitor.Exit with a few tweaks - if (!lockTaken) return; + if (!lockTaken) + return; + object obj = GetStaticLockObject(pEEType); - Monitor.GetLock(obj).Release(); + ObjectHeader.Release(obj); lockTaken = false; } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs index d7df4693019c7..98cb722517117 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Unix.cs @@ -9,8 +9,6 @@ namespace System { public static partial class Environment { - internal static int CurrentNativeThreadId => ManagedThreadId.Current; - public static long TickCount64 => (long)RuntimeImports.RhpGetTickCount64(); [DoesNotReturn] diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs index e26fdcca48acb..9aaa5980fc68d 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Environment.NativeAot.Windows.cs @@ -7,8 +7,6 @@ namespace System { public static partial class Environment { - internal static int CurrentNativeThreadId => unchecked((int)Interop.Kernel32.GetCurrentThreadId()); - public static long TickCount64 => (long)Interop.Kernel32.GetTickCount64(); [DoesNotReturn] diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs index 5c6bf96ff7049..5b8a86cee2878 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Runtime/CompilerServices/RuntimeHelpers.NativeAot.cs @@ -94,17 +94,9 @@ public static object GetObjectValue(object? 
obj) return RuntimeImports.RhCompareObjectContentsAndPadding(o1, o2); } - [ThreadStatic] - private static int t_hashSeed; - internal static int GetNewHashCode() { - int multiplier = Environment.CurrentManagedThreadId * 4 + 5; - // Every thread has its own generator for hash codes so that we won't get into a situation - // where two threads consistently give out the same hash codes. - // Choice of multiplier guarantees period of 2**32 - see Knuth Vol 2 p16 (3.2.1.2 Theorem A). - t_hashSeed = t_hashSeed * multiplier + 1; - return t_hashSeed; + return Random.Shared.Next(); } public static unsafe int GetHashCode(object o) @@ -228,6 +220,9 @@ internal static unsafe ushort GetElementSize(this Array array) internal static unsafe MethodTable* GetMethodTable(this object obj) => obj.m_pEEType; + internal static unsafe ref MethodTable* GetMethodTableRef(this object obj) + => ref obj.m_pEEType; + internal static unsafe EETypePtr GetEETypePtr(this object obj) => new EETypePtr(obj.m_pEEType); diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs index 6d146348ee816..465dbe83356a0 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/RuntimeExceptionHelpers.cs @@ -464,7 +464,7 @@ private static void SerializeExceptionsForDump(Exception currentException, IntPt LowLevelList exceptions = new LowLevelList(curThreadExceptions); LowLevelList nonThrownInnerExceptions = new LowLevelList(); - uint currentThreadId = (uint)Environment.CurrentNativeThreadId; + uint currentThreadId = (uint)Environment.CurrentManagedThreadId; // Reset nesting levels for exceptions on this thread that might not be currently in flight foreach (KeyValuePair item in s_exceptionDataTable) diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs index fa81ae7a43684..1b2aea5b0653f 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Lock.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Diagnostics; +using System.IO; using System.Runtime; using System.Runtime.CompilerServices; @@ -10,17 +11,43 @@ namespace System.Threading [ReflectionBlocked] public sealed class Lock : IDisposable { - // The following constants define characteristics of spinning logic in the Lock class - private const int SpinningNotInitialized = 0; - private const int SpinningDisabled = -1; - private const int MaxSpinningValue = 10000; + // + // This lock is a hybrid spinning/blocking lock with dynamically adjusted spinning. + // On a multiprocessor machine an acquiring thread will try to acquire multiple times + // before going to sleep. The amount of spinning is dynamically adjusted based on past + // history of the lock and will stay in the following range. + // + private const uint MaxSpinLimit = 200; + private const uint MinSpinLimit = 10; + private const uint SpinningNotInitialized = MaxSpinLimit + 1; + private const uint SpinningDisabled = 0; + + // + // We will use exponential backoff in rare cases when we need to change state atomically and cannot + // make progress due to concurrent state changes by other threads. 
+ // While we cannot know the ideal amount of wait needed before making a successful attempt, + // the exponential backoff will generally be not more than 2X worse than the perfect guess and + // will make far fewer attempts than a simple retry. On a multiprocessor machine, fruitless attempts + // will cause unnecessary sharing of the contended state which may make modifying the state more expensive. + // To protect against degenerate cases we will cap the per-iteration wait to 1024 spinwaits. + // + private const uint MaxExponentialBackoffBits = 10; + + // + // This lock is unfair and permits acquiring a contended lock by a nonwaiter in the presence of waiters. + // It is possible for one thread to keep holding the lock long enough that waiters go to sleep and + // then release and reacquire fast enough that waiters have no chance to get the lock. + // In extreme cases one thread could keep retaking the lock starving everybody else. + // If we see woken waiters not able to take the lock for too long we will ask nonwaiters to wait. + // + private const uint WaiterWatchdogTicks = 100; // // NOTE: Lock must not have a static (class) constructor, as Lock itself is used to synchronize // class construction. If Lock has its own class constructor, this can lead to infinite recursion. - // All static data in Lock must be pre-initialized. + // All static data in Lock must be lazy-initialized. // - private static int s_maxSpinCount; + internal static int s_processorCount; // // m_state layout: // // bit 1: True if we've set the event to wake a waiting thread. The waiter resets this to false when it // wakes up. This avoids the overhead of setting the event multiple times. // + // bit 2: True if nonwaiters must not get ahead of waiters when acquiring a contended lock. + // // everything else: A count of the number of threads waiting on the event. // + private const int Uncontended = 0; private const int Locked = 1; private const int WaiterWoken = 2; - private const int WaiterCountIncrement = 4; + private const int YieldToWaiters = 4; + private const int WaiterCountIncrement = 8; - private const int Uncontended = 0; + // state of the lock + private AutoResetEvent? _lazyEvent; + private int _owningThreadId; + private uint _recursionCount; + private int _state; + private uint _spinLimit = SpinningNotInitialized; + private int _wakeWatchDog; - private volatile int _state; + // used to transfer the state when inflating thin locks + internal void InitializeLocked(int threadId, int recursionCount) + { + Debug.Assert(recursionCount == 0 || threadId != 0); - private uint _recursionCount; - private IntPtr _owningThreadId; - private volatile AutoResetEvent? _lazyEvent; + _state = threadId == 0 ? Uncontended : Locked; + _owningThreadId = threadId; + _recursionCount = (uint)recursionCount; + } private AutoResetEvent Event { get { - // - // Can't use LazyInitializer.EnsureInitialized because Lock needs to stay low level enough - // for the purposes of lazy generic lookups. LazyInitializer uses a generic delegate.
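A small worked example of the _state layout described above, as a standalone C# snippet; the constants are copied from the diff and the sample value is hypothetical:

```csharp
using System;

const int Locked = 1, WaiterWoken = 2, YieldToWaiters = 4, WaiterCountIncrement = 8;

// hypothetical snapshot: lock held, a wake already signalled, three threads parked on the event
int state = (3 * WaiterCountIncrement) | Locked | WaiterWoken;

Console.WriteLine((state & Locked) != 0);          // True  - the lock is held
Console.WriteLine((state & WaiterWoken) != 0);     // True  - a woken waiter has not reset the flag yet
Console.WriteLine((state & YieldToWaiters) != 0);  // False - barging acquires are still allowed
Console.WriteLine(state / WaiterCountIncrement);   // 3     - the waiter count sits above the flag bits
```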
- // if (_lazyEvent == null) Interlocked.CompareExchange(ref _lazyEvent, new AutoResetEvent(false), null); @@ -64,27 +101,14 @@ public void Dispose() _lazyEvent?.Dispose(); } - private static IntPtr CurrentNativeThreadId => (IntPtr)RuntimeImports.RhCurrentNativeThreadId(); + private static int CurrentThreadId => Environment.CurrentManagedThreadId; - // On platforms where CurrentNativeThreadId redirects to ManagedThreadId.Current the inlined - // version of Lock.Acquire has the ManagedThreadId.Current call not inlined, while the non-inlined - // version has it inlined. So it saves code to keep this function not inlined while having - // the same runtime cost. [MethodImpl(MethodImplOptions.NoInlining)] public void Acquire() { - IntPtr currentThreadId = CurrentNativeThreadId; - - // - // Make one quick attempt to acquire an uncontended lock - // - if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) - { - Debug.Assert(_owningThreadId == IntPtr.Zero); - Debug.Assert(_recursionCount == 0); - _owningThreadId = currentThreadId; + int currentThreadId = CurrentThreadId; + if (TryAcquireOneShot(currentThreadId)) return; - } // // Fall back to the slow path for contention @@ -103,14 +127,14 @@ public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false) if (millisecondsTimeout < -1) throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - IntPtr currentThreadId = CurrentNativeThreadId; + int currentThreadId = CurrentThreadId; // // Make one quick attempt to acquire an uncontended lock // if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) { - Debug.Assert(_owningThreadId == IntPtr.Zero); + Debug.Assert(_owningThreadId == 0); Debug.Assert(_recursionCount == 0); _owningThreadId = currentThreadId; return true; @@ -122,7 +146,38 @@ public bool TryAcquire(int millisecondsTimeout, bool trackContentions = false) return TryAcquireContended(currentThreadId, millisecondsTimeout, trackContentions); } - private bool TryAcquireContended(IntPtr currentThreadId, int millisecondsTimeout, bool trackContentions = false) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool TryAcquireOneShot(int currentThreadId) + { + // + // Make one quick attempt to acquire an uncontended lock + // + if (Interlocked.CompareExchange(ref _state, Locked, Uncontended) == Uncontended) + { + Debug.Assert(_owningThreadId == 0); + Debug.Assert(_recursionCount == 0); + _owningThreadId = currentThreadId; + return true; + } + + return false; + } + + private static unsafe void ExponentialBackoff(uint iteration) + { + if (iteration > 0) + { + // no need for much randomness here, we will just hash the stack address + iteration. + uint rand = ((uint)&iteration + iteration) * 2654435769u; + // set the highest bit to ensure minimum number of spins is exponentially increasing + // that is in case some stack location results in a sequence of very low spin counts + rand |= (1u << 31); + uint spins = rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits)); + Thread.SpinWaitInternal((int)spins); + } + } + + private bool TryAcquireContended(int currentThreadId, int millisecondsTimeout, bool trackContentions = false) { // // If we already own the lock, just increment the recursion count.
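To make the backoff growth concrete, the same shift arithmetic can be run standalone with a fixed value standing in for the stack-address hash (the real method derives it per call, so actual spin counts vary); the point is the doubling of the minimum wait per iteration and the cap implied by MaxExponentialBackoffBits:

```csharp
using System;

const uint MaxExponentialBackoffBits = 10;

uint SpinsFor(uint iteration, uint rand)
{
    rand |= 1u << 31;  // force the top bit so the minimum wait still doubles each iteration
    return rand >> (byte)(32 - Math.Min(iteration, MaxExponentialBackoffBits));
}

// iteration i waits between 2^(i-1) and 2^i - 1 spin-waits, saturating below 1024 once i reaches 10
for (uint i = 1; i <= 12; i++)
    Console.WriteLine($"backoff iteration {i}: {SpinsFor(i, 0x2D2D2D2Du)} spin-waits");
```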
@@ -139,115 +194,137 @@ private bool TryAcquireContended(IntPtr currentThreadId, int millisecondsTimeout if (millisecondsTimeout == 0) return false; - int spins = 1; - - if (s_maxSpinCount == SpinningNotInitialized) + if (_spinLimit == SpinningNotInitialized) { // Use RhGetProcessCpuCount directly to avoid Environment.ProcessorCount->ClassConstructorRunner->Lock->Environment.ProcessorCount cycle - s_maxSpinCount = (RuntimeImports.RhGetProcessCpuCount() > 1) ? MaxSpinningValue : SpinningDisabled; + if (s_processorCount == 0) + s_processorCount = RuntimeImports.RhGetProcessCpuCount(); + + _spinLimit = (s_processorCount > 1) ? MaxSpinLimit : SpinningDisabled; } + bool hasWaited = false; + // we will retry after waking up while (true) { - // - // Try to grab the lock. We may take the lock here even if there are existing waiters. This creates the possibility - // of starvation of waiters, but it also prevents lock convoys from destroying perf. - // The starvation issue is largely mitigated by the priority boost the OS gives to a waiter when we set - // the event, after we release the lock. Eventually waiters will be boosted high enough to preempt this thread. - // - int oldState = _state; - if ((oldState & Locked) == 0 && Interlocked.CompareExchange(ref _state, oldState | Locked, oldState) == oldState) - goto GotTheLock; - - // - // Back off by a factor of 2 for each attempt, up to MaxSpinCount - // - if (spins <= s_maxSpinCount) - { - RuntimeImports.RhSpinWait(spins); - spins *= 2; - } - else if (oldState != 0) + uint iteration = 0; + uint localSpinLimit = _spinLimit; + // inner loop where we try acquiring the lock or registering as a waiter + while (true) { // - // We reached our spin limit, and need to wait. Increment the waiter count. - // Note that we do not do any overflow checking on this increment. In order to overflow, - // we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the - // forseeable future. + // Try to grab the lock. We may take the lock here even if there are existing waiters. This creates the possibility + // of starvation of waiters, but it also prevents lock convoys and preempted waiters from destroying perf. + // However, if we do not see _wakeWatchDog cleared for long enough, we go into YieldToWaiters mode to ensure some + // waiter progress. // - int newState = (oldState + WaiterCountIncrement) & ~WaiterWoken; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - break; - } - } + int oldState = _state; + bool canAcquire = ((oldState & Locked) == 0) && + (hasWaited || ((oldState & YieldToWaiters) == 0)); - // - // Now we wait. - // + if (canAcquire) + { + int newState = oldState | Locked; + if (hasWaited) + newState = (newState - WaiterCountIncrement) & ~WaiterWoken & ~YieldToWaiters; - if (trackContentions) - { - Monitor.IncrementLockContentionCount(); - } + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + { + if (hasWaited) + _wakeWatchDog = 0; + else + // if spinning was successful, update spin count + if (iteration < localSpinLimit && localSpinLimit < MaxSpinLimit) + _spinLimit = localSpinLimit + 1; + + // GOT THE LOCK!! + Debug.Assert((_state | Locked) != 0); + Debug.Assert(_owningThreadId == 0); + Debug.Assert(_recursionCount == 0); + _owningThreadId = currentThreadId; + return true; + } + } - TimeoutTracker timeoutTracker = TimeoutTracker.Start(millisecondsTimeout); - AutoResetEvent ev = Event; + // spinning was unsuccessful. reduce spin count. 
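A side note on the adaptive spin budget: the increment above (taken when spinning acquired the lock with budget to spare) and the decrement in the line that follows this note are the two halves of a simple tuning policy that keeps _spinLimit drifting inside [MinSpinLimit, MaxSpinLimit]. In isolation, and with hypothetical inputs, it looks like this:

```csharp
using System;

const uint MinSpinLimit = 10;
const uint MaxSpinLimit = 200;

uint AdjustSpinLimit(uint spinLimit, bool acquiredBySpinning, uint iterationsSpent)
{
    // spinning that paid off before the budget ran out earns a slightly larger budget
    if (acquiredBySpinning && iterationsSpent < spinLimit && spinLimit < MaxSpinLimit)
        return spinLimit + 1;

    // a fully spent budget with no acquisition shrinks it
    if (!acquiredBySpinning && iterationsSpent >= spinLimit && spinLimit > MinSpinLimit)
        return spinLimit - 1;

    return spinLimit;
}

Console.WriteLine(AdjustSpinLimit(100, acquiredBySpinning: true, iterationsSpent: 3));    // 101
Console.WriteLine(AdjustSpinLimit(100, acquiredBySpinning: false, iterationsSpent: 100)); // 99
```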
+ if (iteration == localSpinLimit && localSpinLimit > MinSpinLimit) + _spinLimit = localSpinLimit - 1; - while (true) - { + if (iteration++ < localSpinLimit) + { + Thread.SpinWaitInternal(1); + continue; + } + else if (!canAcquire) + { + // + // We reached our spin limit, and need to wait. Increment the waiter count. + // Note that we do not do any overflow checking on this increment. In order to overflow, + // we'd need to have about 1 billion waiting threads, which is inconceivable anytime in the + // forseeable future. + // + int newState = oldState + WaiterCountIncrement; + if (hasWaited) + newState = (newState - WaiterCountIncrement) & ~WaiterWoken; + + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + break; + } + + Debug.Assert(iteration >= localSpinLimit); + ExponentialBackoff(iteration - localSpinLimit); + } + + // + // Now we wait. + // + + if (trackContentions) + { + Monitor.IncrementLockContentionCount(); + } + + TimeoutTracker timeoutTracker = TimeoutTracker.Start(millisecondsTimeout); + Debug.Assert(_state >= WaiterCountIncrement); + bool waitSucceeded = Event.WaitOne(millisecondsTimeout); Debug.Assert(_state >= WaiterCountIncrement); - bool waitSucceeded = ev.WaitOne(timeoutTracker.Remaining); + if (!waitSucceeded) + break; + + // we did not time out and will try acquiring the lock + hasWaited = true; + millisecondsTimeout = timeoutTracker.Remaining; + } + // We timed out. We're not going to wait again. + { + uint iteration = 0; while (true) { int oldState = _state; Debug.Assert(oldState >= WaiterCountIncrement); - // Clear the "waiter woken" bit. - int newState = oldState & ~WaiterWoken; + int newState = oldState - WaiterCountIncrement; - if ((oldState & Locked) == 0) - { - // The lock is available, try to get it. - newState |= Locked; - newState -= WaiterCountIncrement; + // We could not have consumed a wake, or the wait would've succeeded. + // If we are the last waiter though, we will clear WaiterWoken and YieldToWaiters + // just so that lock would not look like contended. + if (newState < WaiterCountIncrement) + newState = newState & ~WaiterWoken & ~YieldToWaiters; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - goto GotTheLock; - } - else if (!waitSucceeded) - { - // The lock is not available, and we timed out. We're not going to wait agin. - newState -= WaiterCountIncrement; + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) + return false; - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - return false; - } - else - { - // The lock is not available, and we didn't time out. We're going to wait again. - if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) - break; - } + ExponentialBackoff(iteration++); } } - - GotTheLock: - Debug.Assert((_state | Locked) != 0); - Debug.Assert(_owningThreadId == IntPtr.Zero); - Debug.Assert(_recursionCount == 0); - _owningThreadId = currentThreadId; - return true; } public bool IsAcquired { get { - // - // The comment below is for platforms where CurrentNativeThreadId redirects to - // ManagedThreadId.Current instead of being a compiler intrinsic. // // Compare the current owning thread ID with the current thread ID. We need // to read the current thread's ID before we read m_owningThreadId. Otherwise, @@ -264,24 +341,38 @@ public bool IsAcquired // because while we're doing this check the current thread is definitely still // alive. 
// - IntPtr currentThreadId = CurrentNativeThreadId; - bool acquired = (currentThreadId == _owningThreadId); - if (acquired) - Debug.Assert((_state & Locked) != 0); - return acquired; + int currentThreadId = CurrentThreadId; + return IsAcquiredByThread(currentThreadId); } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool IsAcquiredByThread(int currentThreadId) + { + bool acquired = (currentThreadId == _owningThreadId); + Debug.Assert(!acquired || (_state & Locked) != 0); + return acquired; + } + [MethodImpl(MethodImplOptions.NoInlining)] public void Release() { - if (!IsAcquired) + ReleaseByThread(CurrentThreadId); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void ReleaseByThread(int threadId) + { + if (threadId != _owningThreadId) throw new SynchronizationLockException(); - if (_recursionCount > 0) - _recursionCount--; - else + if (_recursionCount == 0) + { ReleaseCore(); + return; + } + + _recursionCount--; } internal uint ReleaseAll() @@ -300,7 +391,7 @@ internal uint ReleaseAll() private void ReleaseCore() { Debug.Assert(_recursionCount == 0); - _owningThreadId = IntPtr.Zero; + _owningThreadId = 0; // // Make one quick attempt to release an uncontended lock @@ -317,8 +408,9 @@ private void ReleaseCore() private void ReleaseContended() { Debug.Assert(_recursionCount == 0); - Debug.Assert(_owningThreadId == IntPtr.Zero); + Debug.Assert(_owningThreadId == 0); + uint iteration = 0; while (true) { int oldState = _state; @@ -330,8 +422,21 @@ private void ReleaseContended() { // there are waiters, and nobody has woken one. newState |= WaiterWoken; + + int lastWakeTicks = _wakeWatchDog; + if (lastWakeTicks != 0 && Environment.TickCount - lastWakeTicks > 100) + { + newState |= YieldToWaiters; + } + if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) { + if (lastWakeTicks == 0) + { + // nonzero timestamp of the last wake + _wakeWatchDog = Environment.TickCount | 1; + } + Event.Set(); return; } @@ -342,6 +447,8 @@ private void ReleaseContended() if (Interlocked.CompareExchange(ref _state, newState, oldState) == oldState) return; } + + ExponentialBackoff(iteration++); } } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs index 08872af88871f..38672d2348e5a 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ManagedThreadId.cs @@ -246,6 +246,8 @@ public static void RecycleId(int id) } } + internal static int CurrentManagedThreadIdUnchecked => t_currentManagedThreadId; + public static int Current { get diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs index f5317ff6d22ed..087745d3b71aa 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Monitor.NativeAot.cs @@ -24,19 +24,8 @@ public static partial class Monitor { #region Object->Lock/Condition mapping - private static ConditionalWeakTable s_conditionTable = new ConditionalWeakTable(); - private static ConditionalWeakTable.CreateValueCallback s_createCondition = (o) => new Condition(GetLock(o)); - - internal static Lock GetLock(object obj) - { - if (obj == null) - throw new 
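The watchdog fields used in ReleaseContended above are worth restating in isolation. A sketch (field and constant names mirror the diff; the shape of the handshake, not the exact code, is what is being illustrated) of how the releasing thread and a woken waiter cooperate, including the "| 1" that keeps a real timestamp from ever being stored as 0, since 0 is reserved for "no wake pending":

```csharp
using System;

sealed class WakeWatchdogSketch
{
    private const int WaiterWatchdogTicks = 100;  // ~100ms of no waiter progress flips the lock into fairness mode
    private int _wakeWatchDog;                    // 0 == no woken waiter is currently pending

    // Called by the releasing thread before it sets the event.
    // Returns true when YieldToWaiters should be turned on.
    public bool OnWake()
    {
        int lastWakeTicks = _wakeWatchDog;
        if (lastWakeTicks != 0 && Environment.TickCount - lastWakeTicks > WaiterWatchdogTicks)
            return true;                          // the previously woken waiter is being starved

        if (lastWakeTicks == 0)
            _wakeWatchDog = Environment.TickCount | 1;  // never store 0; costs at most one tick of precision

        return false;
    }

    // Called by a waiter once it finally acquires the lock after sleeping.
    public void OnWaiterAcquired() => _wakeWatchDog = 0;
}
```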
ArgumentNullException(nameof(obj)); - - Debug.Assert(!(obj is Lock), - "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); - - return ObjectHeader.GetLockObject(obj); - } + private static readonly ConditionalWeakTable s_conditionTable = new ConditionalWeakTable(); + private static readonly ConditionalWeakTable.CreateValueCallback s_createCondition = (o) => new Condition(ObjectHeader.GetLockObject(o)); private static Condition GetCondition(object obj) { @@ -49,77 +38,98 @@ private static Condition GetCondition(object obj) #region Public Enter/Exit methods + [MethodImpl(MethodImplOptions.NoInlining)] public static void Enter(object obj) { - Lock lck = GetLock(obj); + int resultOrIndex = ObjectHeader.Acquire(obj); + if (resultOrIndex < 0) + return; + + Lock lck = resultOrIndex == 0 ? + ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) return; + TryAcquireContended(lck, obj, Timeout.Infinite); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Enter(object obj, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - Lock lck = GetLock(obj); - if (lck.TryAcquire(0)) - { - lockTaken = true; - return; - } - TryAcquireContended(lck, obj, Timeout.Infinite); + Enter(obj); lockTaken = true; } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool TryEnter(object obj) { - return GetLock(obj).TryAcquire(0); + int resultOrIndex = ObjectHeader.TryAcquire(obj); + if (resultOrIndex < 0) + return true; + + if (resultOrIndex == 0) + return false; + + Lock lck = SyncTable.GetLockObject(resultOrIndex); + return lck.TryAcquire(0); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void TryEnter(object obj, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - lockTaken = GetLock(obj).TryAcquire(0); + lockTaken = TryEnter(obj); } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool TryEnter(object obj, int millisecondsTimeout) { if (millisecondsTimeout < -1) throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - Lock lck = GetLock(obj); + int resultOrIndex = ObjectHeader.TryAcquire(obj); + if (resultOrIndex < 0) + return true; + + Lock lck = resultOrIndex == 0 ? 
+ ObjectHeader.GetLockObject(obj) : + SyncTable.GetLockObject(resultOrIndex); + if (lck.TryAcquire(0)) return true; + return TryAcquireContended(lck, obj, millisecondsTimeout); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void TryEnter(object obj, int millisecondsTimeout, ref bool lockTaken) { + // we are inlining lockTaken check as the check is likely be optimized away if (lockTaken) throw new ArgumentException(SR.Argument_MustBeFalse, nameof(lockTaken)); - if (millisecondsTimeout < -1) - throw new ArgumentOutOfRangeException(nameof(millisecondsTimeout), SR.ArgumentOutOfRange_NeedNonNegOrNegative1); - Lock lck = GetLock(obj); - if (lck.TryAcquire(0)) - { - lockTaken = true; - return; - } - lockTaken = TryAcquireContended(lck, obj, millisecondsTimeout); + lockTaken = TryEnter(obj, millisecondsTimeout); } + [MethodImpl(MethodImplOptions.NoInlining)] public static void Exit(object obj) { - GetLock(obj).Release(); + ObjectHeader.Release(obj); } + [MethodImpl(MethodImplOptions.NoInlining)] public static bool IsEntered(object obj) { - return GetLock(obj).IsAcquired; + return ObjectHeader.IsAcquired(obj); } #endregion @@ -192,20 +202,19 @@ private enum DebugBlockingItemType } // Represents an item a thread is blocked on. This structure is allocated on the stack and accessed by the debugger. - // Fields are volatile to avoid potential compiler optimizations. private struct DebugBlockingItem { // The object the thread is waiting on - public volatile object _object; + public object _object; // Indicates how the thread is blocked on the item - public volatile DebugBlockingItemType _blockingType; + public DebugBlockingItemType _blockingType; // Blocking timeout in milliseconds or Timeout.Infinite for no timeout - public volatile int _timeout; + public int _timeout; // Next pointer in the linked list of DebugBlockingItem records - public volatile IntPtr _next; + public IntPtr _next; } private unsafe struct DebugBlockingScope : IDisposable diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs index c9c5d5c0bc1cf..0de12219eb913 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/ObjectHeader.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using Internal.Runtime; using System.Diagnostics; using System.Runtime.CompilerServices; @@ -16,39 +17,36 @@ namespace System.Threading /// internal static class ObjectHeader { - // The following two header bits are used by the GC engine: + // The following three header bits are reserved for the GC engine: + // BIT_SBLK_UNUSED = 0x80000000 // BIT_SBLK_FINALIZER_RUN = 0x40000000 // BIT_SBLK_GC_RESERVE = 0x20000000 // // All other bits may be used to store runtime data: hash code, sync entry index, etc. // Here we use the same bit layout as in CLR: if bit 26 (BIT_SBLK_IS_HASHCODE) is set, // all the lower bits 0..25 store the hash code, otherwise they store either the sync - // entry index or all zero. - // - // If needed, the MASK_HASHCODE_INDEX bit mask may be made wider or narrower than the - // current 26 bits; the BIT_SBLK_IS_HASHCODE bit is not required to be adjacent to the - // mask. The code only assumes that MASK_HASHCODE_INDEX occupies the lowest bits of the - // header (i.e. 
ends with bit 0) and that (MASK_HASHCODE_INDEX + 1) does not overflow - // the Int32 type (i.e. the mask may be no longer than 30 bits). - + // entry index (indicated by BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) or thin lock data. private const int IS_HASHCODE_BIT_NUMBER = 26; + private const int IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER = 27; private const int BIT_SBLK_IS_HASHCODE = 1 << IS_HASHCODE_BIT_NUMBER; internal const int MASK_HASHCODE_INDEX = BIT_SBLK_IS_HASHCODE - 1; + private const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << IS_HASH_OR_SYNCBLKINDEX_BIT_NUMBER; + + // if BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX is clear, the rest of the header dword is laid out as follows: + // - lower sixteen bits (bits 0 thru 15) is thread id used for the thin locks + // value is zero if no thread is holding the lock + // - following six bits (bits 16 thru 21) is recursion level used for the thin locks + // value is zero if lock is not taken or only taken once by the same thread + private const int SBLK_MASK_LOCK_THREADID = 0x0000FFFF; // special value of 0 + 65535 thread ids + private const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000; // 64 recursion levels + private const int SBLK_LOCK_RECLEVEL_INC = 0x00010000; // each level is this much higher than the previous one + private const int SBLK_RECLEVEL_SHIFT = 16; // shift right this much to get recursion level -#if TARGET_ARM || TARGET_ARM64 - [MethodImpl(MethodImplOptions.NoInlining)] -#else [MethodImpl(MethodImplOptions.AggressiveInlining)] -#endif - public static unsafe int ReadVolatileMemory(int* pHeader) + private static unsafe int* GetHeaderPtr(MethodTable** ppMethodTable) { - // While in x86/amd64 Volatile.Read is cheap, in arm we have to pay the - // cost of a barrier. We do no inlining to get around that. -#if TARGET_ARM || TARGET_ARM64 - return *pHeader; -#else - return Volatile.Read(ref *pHeader); -#endif + // The header is 4 bytes before m_pEEType field on all architectures + return (int*)ppMethodTable - 1; } /// @@ -58,16 +56,12 @@ public static unsafe int ReadVolatileMemory(int* pHeader) public static unsafe int GetHashCode(object o) { if (o == null) - { return 0; - } - fixed (byte* pRawData = &o.GetRawData()) + fixed (MethodTable** ppMethodTable = &o.GetMethodTableRef()) { - // The header is 4 bytes before m_pEEType field on all architectures - int* pHeader = (int*)(pRawData - sizeof(IntPtr) - sizeof(int)); - - int bits = ReadVolatileMemory(pHeader); + int* pHeader = GetHeaderPtr(ppMethodTable); + int bits = *pHeader; int hashOrIndex = bits & MASK_HASHCODE_INDEX; if ((bits & BIT_SBLK_IS_HASHCODE) != 0) { @@ -75,7 +69,8 @@ public static unsafe int GetHashCode(object o) Debug.Assert(hashOrIndex != 0); return hashOrIndex; } - if (hashOrIndex != 0) + + if ((bits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) { // Look up the hash code in the SyncTable int hashCode = SyncTable.GetHashCode(hashOrIndex); @@ -84,19 +79,18 @@ public static unsafe int GetHashCode(object o) return hashCode; } } + // The hash code has not yet been set. Assign some value. - return AssignHashCode(pHeader); + return AssignHashCode(o, pHeader); } } /// /// Assigns a hash code to the object in a thread-safe way. /// - private static unsafe int AssignHashCode(int* pHeader) + private static unsafe int AssignHashCode(object o, int* pHeader) { int newHash = RuntimeHelpers.GetNewHashCode() & MASK_HASHCODE_INDEX; - int bitAndValue; - // Never use the zero hash code. SyncTable treats the zero value as "not assigned". 
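A worked example of the header layout described above, with the constants copied from the diff; the sample header values are hypothetical and the three GC-reserved bits are ignored:

```csharp
using System;

const int BIT_SBLK_IS_HASHCODE             = 1 << 26;
const int BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX = 1 << 27;
const int MASK_HASHCODE_INDEX              = BIT_SBLK_IS_HASHCODE - 1;
const int SBLK_MASK_LOCK_THREADID          = 0x0000FFFF;
const int SBLK_MASK_LOCK_RECLEVEL          = 0x003F0000;
const int SBLK_RECLEVEL_SHIFT              = 16;

void Describe(int header)
{
    if ((header & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0)
    {
        // thin-lock form: owner thread id in bits 0..15, extra recursions in bits 16..21
        int tid = header & SBLK_MASK_LOCK_THREADID;
        int rec = (header & SBLK_MASK_LOCK_RECLEVEL) >> SBLK_RECLEVEL_SHIFT;
        Console.WriteLine(tid == 0 ? "unlocked, no hash code" : $"thin lock: thread {tid}, held {rec + 1}x");
    }
    else if ((header & BIT_SBLK_IS_HASHCODE) != 0)
    {
        Console.WriteLine($"hash code stored inline: {header & MASK_HASHCODE_INDEX}");
    }
    else
    {
        Console.WriteLine($"sync table index: {header & MASK_HASHCODE_INDEX}");
    }
}

Describe(0);                                                                // unlocked, no hash code
Describe(42);                                                               // thin lock: thread 42, held 1x
Describe(42 | (2 << SBLK_RECLEVEL_SHIFT));                                  // thin lock: thread 42, held 3x
Describe(BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_IS_HASHCODE | 12345);  // hash code stored inline: 12345
Describe(BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | 7);                             // sync table index: 7
```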
if (newHash == 0) { @@ -105,73 +99,82 @@ private static unsafe int AssignHashCode(int* pHeader) while (true) { - int oldBits = Volatile.Read(ref *pHeader); - bitAndValue = oldBits & (BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - if (bitAndValue != 0) + int oldBits = *pHeader; + + // if have hashcode, just return it + if ((oldBits & BIT_SBLK_IS_HASHCODE) != 0) + { + // Found the hash code in the header + int h = oldBits & MASK_HASHCODE_INDEX; + Debug.Assert(h != 0); + return h; + } + + // if have something else, break, we need a syncblock. + if ((oldBits & MASK_HASHCODE_INDEX) != 0) { - // The header already stores some value break; } - // The header stores nothing. Try to store the hash code. - int newBits = oldBits | BIT_SBLK_IS_HASHCODE | newHash; + // there is nothing - try set hashcode inline + Debug.Assert((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0); + int newBits = BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_IS_HASHCODE | oldBits | newHash; if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) { return newHash; } - // Another thread modified the header; try again + // contention, try again } - if ((bitAndValue & BIT_SBLK_IS_HASHCODE) == 0) + if (!GetSyncEntryIndex(*pHeader, out int syncIndex)) { - // Set the hash code in SyncTable. This call will resolve the potential race. - return SyncTable.SetHashCode(bitAndValue, newHash); + // Assign a new sync entry + syncIndex = SyncTable.AssignEntry(o, pHeader); } - // Another thread set the hash code, use it - Debug.Assert((bitAndValue & ~BIT_SBLK_IS_HASHCODE) != 0); - return bitAndValue & ~BIT_SBLK_IS_HASHCODE; + // Set the hash code in SyncTable. This call will resolve the potential race. + return SyncTable.SetHashCode(syncIndex, newHash); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool HasSyncEntryIndex(int header) + { + return (header & (BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX | BIT_SBLK_IS_HASHCODE)) == BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX; } /// /// Extracts the sync entry index or the hash code from the header value. Returns true /// if the header value stores the sync entry index. /// - // Inlining is important for lock performance [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static bool GetSyncEntryIndex(int header, out int hashOrIndex) + public static bool GetSyncEntryIndex(int header, out int index) { - hashOrIndex = header & MASK_HASHCODE_INDEX; - // The following is equivalent to: - // return (hashOrIndex != 0) && ((header & BIT_SBLK_IS_HASHCODE) == 0); - // Shifting the BIT_SBLK_IS_HASHCODE bit to the sign bit saves one branch. - int bitAndValue = header & (BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - return (bitAndValue << (31 - IS_HASHCODE_BIT_NUMBER)) > 0; + index = header & MASK_HASHCODE_INDEX; + return HasSyncEntryIndex(header); } /// /// Returns the Monitor synchronization object assigned to this object. If no synchronization /// object has yet been assigned, it assigns one in a thread-safe way. 
/// - // Called from Monitor.Enter only; inlining is important for lock performance - [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe Lock GetLockObject(object o) { - fixed (byte* pRawData = &o.GetRawData()) - { - // The header is 4 bytes before m_pEEType field on all architectures - int* pHeader = (int*)(pRawData - sizeof(IntPtr) - sizeof(int)); + return SyncTable.GetLockObject(GetSyncIndex(o)); + } - if (GetSyncEntryIndex(ReadVolatileMemory(pHeader), out int hashOrIndex)) + private static unsafe int GetSyncIndex(object o) + { + fixed (MethodTable** ppMethodTable = &o.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + if (GetSyncEntryIndex(*pHeader, out int syncIndex)) { - // Already have a sync entry for this object, return the synchronization object - // stored in the entry. - return SyncTable.GetLockObject(hashOrIndex); + return syncIndex; } + // Assign a new sync entry - int syncIndex = SyncTable.AssignEntry(o, pHeader); - return SyncTable.GetLockObject(syncIndex); + return SyncTable.AssignEntry(o, pHeader); } } @@ -183,32 +186,304 @@ public static unsafe void SetSyncEntryIndex(int* pHeader, int syncIndex) // Holding this lock implies there is at most one thread setting the sync entry index at // any given time. We also require that the sync entry index has not been already set. Debug.Assert(SyncTable.s_lock.IsAcquired); - Debug.Assert((syncIndex & MASK_HASHCODE_INDEX) == syncIndex); - int oldBits, newBits, hashOrIndex; + int oldBits, newBits; do { - oldBits = Volatile.Read(ref *pHeader); - newBits = oldBits; + oldBits = *pHeader; + // we should not have a sync index yet. + Debug.Assert(!HasSyncEntryIndex(oldBits)); - if (GetSyncEntryIndex(oldBits, out hashOrIndex)) + if ((oldBits & BIT_SBLK_IS_HASHCODE) != 0) { - // Must not get here; see the contract - throw new InvalidOperationException(); + // set the hash code in the sync entry + SyncTable.MoveHashCodeToNewEntry(syncIndex, oldBits & MASK_HASHCODE_INDEX); + // reset the lock info, in case we have set it in the previous iteration + SyncTable.MoveThinLockToNewEntry(syncIndex, 0, 0); } - - Debug.Assert(((oldBits & BIT_SBLK_IS_HASHCODE) == 0) || (hashOrIndex != 0)); - if (hashOrIndex != 0) + else { - // Move the hash code to the sync entry - SyncTable.MoveHashCodeToNewEntry(syncIndex, hashOrIndex); + // set the lock info + SyncTable.MoveThinLockToNewEntry( + syncIndex, + oldBits & SBLK_MASK_LOCK_THREADID, + (oldBits & SBLK_MASK_LOCK_RECLEVEL) >> SBLK_RECLEVEL_SHIFT); } // Store the sync entry index - newBits &= ~(BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); - newBits |= syncIndex; + newBits = oldBits & ~(BIT_SBLK_IS_HASHCODE | MASK_HASHCODE_INDEX); + newBits |= syncIndex | BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX; } while (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) != oldBits); } + + // + // A few words about spinning choices: + // + // Most locks are not contentious. In fact most locks provide exclusive access safety, but in reality are used by + // one thread. And then there are locks that do see multiple threads, but the accesses are short and not overlapping. + // Thin lock is an optimization for such scenarios. + // + // If we see a thin lock held by other thread for longer than ~5 microseconds, we will "inflate" the lock + // and let the adaptive spinning in the fat Lock sort it out whether we have a contentious lock or a long-held lock. + // + // Another thing to consider is that SpinWait(1) is calibrated to about 35-50 nanoseconds. 
+ // It can take much longer only if nop/pause takes much longer, which it should not, as that would be getting + // close to the RAM latency. + // + // Considering that taking and releasing the lock takes 2 CAS instructions + some overhead, we can estimate shortest + // time the lock can be held to be in hundreds of nanoseconds. Thus it is unlikely to see more than + // 8-10 threads contending for the lock without inflating it. Therefore we can expect to acquire a thin lock in + // under 16 tries. + // + // As for the backoff strategy we have two choices: + // Exponential back-off with a limit: + // 0, 1, 2, 4, 8, 8, 8, 8, 8, 8, 8, . . . . + // + // Linear back-off + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, . . . . + // + // In this case these strategies are close in terms of average and worst case latency, so we will prefer linear + // back-off as it favors the micro-contention scenario, which we expect. + // + + // Returns: + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe int Acquire(object obj) + { + return TryAcquire(obj, oneShot: false); + } + + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe int TryAcquire(object obj, bool oneShot = true) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + + // if the thread ID is uninitialized or too big, we take the "uncommon" path. + if ((uint)(currentThreadID - 1) < (uint)SBLK_MASK_LOCK_THREADID) + { + // for an object used in locking there are two common cases: + // - header bits are unused or + // - there is a sync entry + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + int oldBits = *pHeader; + // if unused for anything, try setting our thread id + // N.B. hashcode, thread ID and sync index are never 0, and hashcode is largest of all + if ((oldBits & MASK_HASHCODE_INDEX) == 0) + { + if (Interlocked.CompareExchange(ref *pHeader, oldBits | currentThreadID, oldBits) == oldBits) + { + return -1; + } + } + else if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + if (SyncTable.GetLockObject(syncIndex).TryAcquireOneShot(currentThreadID)) + { + return -1; + } + + // has sync entry -> slow path + return syncIndex; + } + } + } + + return TryAcquireUncommon(obj, oneShot); + } + + // handling uncommon cases here - recursive lock, contention, retries + // -1 - success + // 0 - failed + // syncIndex - retry with the Lock + private static unsafe int TryAcquireUncommon(object obj, bool oneShot) + { + // does thread ID fit? + int currentThreadID = Environment.CurrentManagedThreadId; + if (currentThreadID > SBLK_MASK_LOCK_THREADID) + return GetSyncIndex(obj); + + // Lock.s_processorCount is lazy-initialized at first contended acquire + // until then it is 0 and we assume we have a multicore machine + int retries = oneShot || Lock.s_processorCount == 1 ? 0 : 16; + + // retry when the lock is owned by somebody else. + // this loop will spinwait between iterations. + for (int i = 0; i <= retries; i++) + { + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + + // rare retries when lock is not owned by somebody else.
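A quick arithmetic check of the figures quoted above (16 retries, ~35-50 ns per spin-wait): the linear back-off loop in TryAcquireUncommon burns SpinWait(0) + SpinWait(1) + ... + SpinWait(16) before giving up, which lands right around the ~5 microsecond point at which a still-held thin lock gets inflated:

```csharp
using System;

int totalSpinWaits = 0;
for (int i = 0; i <= 16; i++)   // the loop runs i = 0..retries with retries == 16
    totalSpinWaits += i;        // Thread.SpinWaitInternal(i) at the end of each fruitless iteration

Console.WriteLine(totalSpinWaits);                                             // 136
Console.WriteLine($"~{totalSpinWaits * 35} ns to ~{totalSpinWaits * 50} ns");  // ~4760 ns to ~6800 ns
```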
+ // these do not count as iterations and do not spinwait. + while (true) + { + int oldBits = *pHeader; + + // if unused for anything, try setting our thread id + // N.B. hashcode, thread ID and sync index are never 0, and hashcode is largest of all + if ((oldBits & MASK_HASHCODE_INDEX) == 0) + { + int newBits = oldBits | currentThreadID; + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return -1; + } + + // contention on a lock that noone owned, + // but we do not know if there is an owner yet, so try again + continue; + } + + // has sync entry -> slow path + if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + return syncIndex; + } + + // has hashcode -> slow path + if ((oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) != 0) + { + return SyncTable.AssignEntry(obj, pHeader); + } + + // we own the lock already + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID) + { + // try incrementing recursion level, check for overflow + int newBits = oldBits + SBLK_LOCK_RECLEVEL_INC; + if ((newBits & SBLK_MASK_LOCK_RECLEVEL) != 0) + { + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return -1; + } + + // rare contention on owned lock, + // perhaps hashcode was installed or finalization bits were touched. + // we still own the lock though and may be able to increment, try again + continue; + } + else + { + // overflow, transition to a fat Lock + return SyncTable.AssignEntry(obj, pHeader); + } + } + + // someone else owns. + break; + } + } + + // spin a bit before retrying (1 spinwait is roughly 35 nsec) + // the object is not pinned here + Thread.SpinWaitInternal(i); + } + + // owned by somebody else + return 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void Release(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + // transform uninitialized ID into -1, so it will not match any possible lock owner + currentThreadID |= (currentThreadID - 1) >> 31; + + Lock fatLock; + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + while (true) + { + int oldBits = *pHeader; + + // if we own the lock + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID && + (oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) + { + // decrement count or release entirely. + int newBits = (oldBits & SBLK_MASK_LOCK_RECLEVEL) != 0 ? + oldBits - SBLK_LOCK_RECLEVEL_INC : + oldBits & ~SBLK_MASK_LOCK_THREADID; + + if (Interlocked.CompareExchange(ref *pHeader, newBits, oldBits) == oldBits) + { + return; + } + + // rare contention on owned lock, + // we still own the lock, try again + continue; + } + + if (!GetSyncEntryIndex(oldBits, out int syncIndex)) + { + // someone else owns or noone. 
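Two small worked examples for the thin-lock paths shown above (constants copied from the diff; thread id 5 and the loop driver are hypothetical): the first shows when the six-bit recursion field forces inflation, the second shows what the id normalization used by Release above and IsAcquired below actually does:

```csharp
using System;

const int SBLK_MASK_LOCK_RECLEVEL = 0x003F0000;
const int SBLK_LOCK_RECLEVEL_INC  = 0x00010000;

// 1) recursion overflow: a thin lock held once by thread 5, re-entered until the field is exhausted
int bits = 5;
int recursiveEnters = 0;
while (true)
{
    int newBits = bits + SBLK_LOCK_RECLEVEL_INC;
    if ((newBits & SBLK_MASK_LOCK_RECLEVEL) == 0)
        break;                 // the increment overflowed out of the 6-bit field: hand over to a SyncTable entry
    bits = newBits;
    recursiveEnters++;
}
Console.WriteLine(recursiveEnters);   // 63: a thin lock carries at most 64 nested acquisitions

// 2) "currentThreadID |= (currentThreadID - 1) >> 31" maps an uninitialized id (0) to -1,
//    so it can never match a stored owner id, while leaving real ids (>= 1) unchanged
int Normalize(int currentThreadID) => currentThreadID | ((currentThreadID - 1) >> 31);
Console.WriteLine(Normalize(0));      // -1
Console.WriteLine(Normalize(1));      //  1
Console.WriteLine(Normalize(42));     // 42
```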
+ throw new SynchronizationLockException(); + } + + fatLock = SyncTable.GetLockObject(syncIndex); + break; + } + } + + fatLock.ReleaseByThread(currentThreadID); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe bool IsAcquired(object obj) + { + ArgumentNullException.ThrowIfNull(obj); + + Debug.Assert(!(obj is Lock), + "Do not use Monitor.Enter or TryEnter on a Lock instance; use Lock methods directly instead."); + + int currentThreadID = ManagedThreadId.CurrentManagedThreadIdUnchecked; + // transform uninitialized ID into -1, so it will not match any possible lock owner + currentThreadID |= (currentThreadID - 1) >> 31; + + fixed (MethodTable** ppMethodTable = &obj.GetMethodTableRef()) + { + int* pHeader = GetHeaderPtr(ppMethodTable); + int oldBits = *pHeader; + + // if we own the lock + if ((oldBits & SBLK_MASK_LOCK_THREADID) == currentThreadID && + (oldBits & BIT_SBLK_IS_HASH_OR_SYNCBLKINDEX) == 0) + { + return true; + } + + if (GetSyncEntryIndex(oldBits, out int syncIndex)) + { + return SyncTable.GetLockObject(syncIndex).IsAcquiredByThread(currentThreadID); + } + + // someone else owns or noone. + return false; + } + } } } diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs index e531323d62a81..de07c986834ec 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/SyncTable.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.Runtime; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Threading { @@ -62,9 +63,9 @@ internal static class SyncTable private const int DoublingSizeThreshold = 1 << 20; /// - /// Protects all mutable operations on s_entrie, s_freeEntryList, s_unusedEntryIndex. Also protects growing the table. + /// Protects all mutable operations on s_entries, s_freeEntryList, s_unusedEntryIndex. Also protects growing the table. /// - internal static Lock s_lock = new Lock(); + internal static readonly Lock s_lock = new Lock(); /// /// The dynamically growing array of sync entries. @@ -83,6 +84,16 @@ internal static class SyncTable /// private static int s_unusedEntryIndex = 1; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ref Entry UnsafeEntryRef(int i) + { +#if DEBUG + return ref s_entries[i]; +#else + return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(s_entries), i); +#endif + } + /// /// Assigns a sync table entry to the object in a thread-safe way. /// @@ -98,12 +109,11 @@ public static unsafe int AssignEntry(object obj, int* pHeader) using (LockHolder.Hold(s_lock)) { // After acquiring the lock check whether another thread already assigned the sync entry - if (ObjectHeader.GetSyncEntryIndex(*pHeader, out int hashOrIndex)) + if (ObjectHeader.GetSyncEntryIndex(*pHeader, out int syncIndex)) { - return hashOrIndex; + return syncIndex; } - int syncIndex; if (s_freeEntryList != 0) { // Grab a free entry from the list @@ -219,7 +229,7 @@ public static int GetHashCode(int syncIndex) // This thread may be looking at an old version of s_entries. If the old version had // no hash code stored, GetHashCode returns zero and the subsequent SetHashCode call // will resolve the potential race. 
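The UnsafeEntryRef helper added to SyncTable above swaps the indexer for an unchecked ref in release builds, presumably to drop the array bounds check on the hot GetHashCode/GetLockObject paths (its call sites follow below). The pattern in isolation, on a plain array rather than the Entry table:

```csharp
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

static class NoBoundsCheck
{
    // Safe only when the caller guarantees 0 <= i < array.Length by construction,
    // as SyncTable does for assigned sync indices (the entries array is only ever replaced by a larger copy).
    public static ref T At<T>(T[] array, int i)
    {
#if DEBUG
        return ref array[i];  // keep the runtime check in debug builds
#else
        return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(array), i);
#endif
    }
}
```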
- return s_entries[syncIndex].HashCode; + return UnsafeEntryRef(syncIndex).HashCode; } /// @@ -255,6 +265,18 @@ public static void MoveHashCodeToNewEntry(int syncIndex, int hashCode) s_entries[syncIndex].HashCode = hashCode; } + /// + /// Initializes the Lock assuming the caller holds s_lock. Use for not yet + /// published entries only. + /// + public static void MoveThinLockToNewEntry(int syncIndex, int threadId, int recursionLevel) + { + Debug.Assert(s_lock.IsAcquired); + Debug.Assert((0 < syncIndex) && (syncIndex < s_unusedEntryIndex)); + + s_entries[syncIndex].Lock.InitializeLocked(threadId, recursionLevel); + } + /// /// Returns the Monitor synchronization object. The return value is never null. /// @@ -262,7 +284,7 @@ public static Lock GetLockObject(int syncIndex) { // Note that we do not take a lock here. When we replace s_entries, we preserve all // indices and Lock references. - return s_entries[syncIndex].Lock; + return UnsafeEntryRef(syncIndex).Lock; } private sealed class DeadEntryCollector @@ -281,7 +303,7 @@ public DeadEntryCollector() return; Lock? lockToDispose = default; - DependentHandle dependentHadleToDispose = default; + DependentHandle dependentHandleToDispose = default; using (LockHolder.Hold(s_lock)) { @@ -294,7 +316,7 @@ public DeadEntryCollector() return; } - dependentHadleToDispose = entry.Owner; + dependentHandleToDispose = entry.Owner; entry.Owner = default; lockToDispose = entry.Lock; @@ -305,7 +327,7 @@ public DeadEntryCollector() } // Dispose outside the lock - dependentHadleToDispose.Dispose(); + dependentHandleToDispose.Dispose(); lockToDispose?.Dispose(); } } @@ -331,6 +353,10 @@ private struct Entry /// public DependentHandle Owner; + // Unused field to make the entry even number of words. + // we will likely put something in here eventually. + internal nint _unusedPadding; + /// /// For entries in use, this property gets or sets the hash code of the owner object. /// The zero value indicates the hash code has not yet been assigned. diff --git a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs index 543ba774c39a9..7c46ced230300 100644 --- a/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs +++ b/src/coreclr/nativeaot/System.Private.CoreLib/src/System/Threading/Thread.NativeAot.cs @@ -320,9 +320,22 @@ public bool Join(int millisecondsTimeout) /// appropriate for the processor. /// TODO: See issue https://github.com/dotnet/corert/issues/4430 /// - internal const int OptimalMaxSpinWaitsPerSpinIteration = 64; + internal const int OptimalMaxSpinWaitsPerSpinIteration = 8; - [MethodImpl(MethodImplOptions.NoInlining)] + // Max iterations to be done in RhSpinWait. + // RhSpinWait does not switch GC modes and we want to avoid native spinning in coop mode for too long. + private const int SpinWaitCoopThreshold = 1024; + + internal static void SpinWaitInternal(int iterations) + { + Debug.Assert(iterations <= SpinWaitCoopThreshold); + if (iterations > 0) + { + RuntimeImports.RhSpinWait(iterations); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] // Slow path method. Make sure that the caller frame does not pay for PInvoke overhead. 
private static void LongSpinWait(int iterations) { RuntimeImports.RhLongSpinWait(iterations); @@ -330,20 +343,13 @@ private static void LongSpinWait(int iterations) public static void SpinWait(int iterations) { - if (iterations <= 0) - return; - - // Max iterations to be done in RhSpinWait. - // RhSpinWait does not switch GC modes and we want to avoid native spinning in coop mode for too long. - const int spinWaitCoopThreshold = 10000; - - if (iterations > spinWaitCoopThreshold) + if (iterations > SpinWaitCoopThreshold) { LongSpinWait(iterations); } else { - RuntimeImports.RhSpinWait(iterations); + SpinWaitInternal(iterations); } }
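The managed threshold above matches the 1024-iteration limit that the native assert in MiscHelpers.cpp now enforces for cooperative mode. Using the ~35-50 ns per spin-wait calibration quoted in ObjectHeader.cs, that caps a single coop-mode RhSpinWait call at a few tens of microseconds:

```csharp
using System;

const int SpinWaitCoopThreshold = 1024;
Console.WriteLine($"worst case ~{SpinWaitCoopThreshold * 35 / 1000}-{SpinWaitCoopThreshold * 50 / 1000} microseconds per RhSpinWait call");
// prints: worst case ~35-51 microseconds per RhSpinWait call
```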