diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 1a00069e6..83c5bbf81 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -46,10 +46,62 @@ jobs:
         with:
           platforms: all
 
-      - name: Build wheels on ${{ matrix.os }} using cibuildwheel
-        uses: pypa/cibuildwheel@v2.17
+      - name: Build 3.8 wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.20
         env:
-          CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
+          CIBW_BUILD: "cp38-*"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+      - name: Build 3.9 wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.20
+        env:
+          CIBW_BUILD: "cp39-*"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+      - name: Build 3.10 wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.20
+        env:
+          CIBW_BUILD: "cp310-*"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+      - name: Build 3.11 wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.20
+        env:
+          CIBW_BUILD: "cp311-*"
+          CIBW_SKIP: "*-musllinux_*"
+          CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
+          CIBW_ARCHS_LINUX: auto64 aarch64
+          CIBW_ARCHS_WINDOWS: auto64
+          CIBW_BEFORE_ALL_LINUX: bash .github/install_bazel.sh
+          # Grab the rootless Bazel installation inside the container.
+          CIBW_ENVIRONMENT_LINUX: PATH=$PATH:$HOME/bin
+          CIBW_TEST_COMMAND: python {project}/bindings/python/google_benchmark/example.py
+
+      - name: Build 3.12 wheels on ${{ matrix.os }} using cibuildwheel
+        uses: pypa/cibuildwheel@v2.20
+        env:
+          CIBW_BUILD: "cp312-*"
           CIBW_SKIP: "*-musllinux_*"
           CIBW_TEST_SKIP: "cp38-macosx_*:arm64"
           CIBW_ARCHS_LINUX: auto64 aarch64
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 216c1c921..e0cd6962e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Require CMake 3.10. If available, use the policies up to CMake 3.22.
 cmake_minimum_required (VERSION 3.10...3.22)
 
-project (benchmark VERSION 1.8.5 LANGUAGES CXX)
+project (benchmark VERSION 1.9.0 LANGUAGES CXX)
 
 option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
@@ -150,6 +150,10 @@ if (MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
+  if(BENCHMARK_ENABLE_WERROR)
+      add_cxx_compiler_flag(-WX)
+  endif()
+
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-EHs-)
     add_cxx_compiler_flag(-EHa-)
diff --git a/MODULE.bazel b/MODULE.bazel
index 4210ea0be..e4f170c83 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -1,6 +1,6 @@
 module(
     name = "google_benchmark",
-    version = "1.8.5",
+    version = "1.9.0",
 )
 
 bazel_dep(name = "bazel_skylib", version = "1.5.0")
diff --git a/bindings/python/google_benchmark/example.py b/bindings/python/google_benchmark/example.py
index b5b2f88ff..b92245ea6 100644
--- a/bindings/python/google_benchmark/example.py
+++ b/bindings/python/google_benchmark/example.py
@@ -61,6 +61,7 @@ def skipped(state):
 
 
 @benchmark.register
+@benchmark.option.use_manual_time()
 def manual_timing(state):
     while state:
         # Manually count Python CPU time
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 4cdb4515c..53a22247f 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -290,11 +290,50 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_OVERRIDE
 #endif
 
+#if defined(__GNUC__)
+// Determine the cacheline size based on architecture
+#if defined(__i386__) || defined(__x86_64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__powerpc64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 128
+#elif defined(__aarch64__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#elif defined(__arm__)
+// Cache line sizes for ARM: These values are not strictly correct since
+// cache line sizes depend on implementations, not architectures.  There
+// are even implementations with cache line sizes configurable at boot
+// time.
+#if defined(__ARM_ARCH_5T__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 32
+#elif defined(__ARM_ARCH_7A__)
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif  // ARM_ARCH
+#endif  // arches
+#endif  // __GNUC__
+
+#ifndef BENCHMARK_INTERNAL_CACHELINE_SIZE
+// A reasonable default guess.  Note that overestimates tend to waste more
+// space, while underestimates tend to waste more time.
+#define BENCHMARK_INTERNAL_CACHELINE_SIZE 64
+#endif
+
+#if defined(__GNUC__)
+// Indicates that the declared object should be cache-aligned to
+// `BENCHMARK_INTERNAL_CACHELINE_SIZE` bytes (see above).
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __attribute__((aligned(BENCHMARK_INTERNAL_CACHELINE_SIZE)))
+#elif defined(_MSC_VER)
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED \
+  __declspec(align(BENCHMARK_INTERNAL_CACHELINE_SIZE))
+#else
+#define BENCHMARK_INTERNAL_CACHELINE_ALIGNED
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(push)
 // C4251: <symbol> needs to have dll-interface to be used by clients of class
 #pragma warning(disable : 4251)
-#endif
+#endif  // _MSC_VER
 
 namespace benchmark {
 class BenchmarkReporter;
@@ -757,9 +796,14 @@ enum Skipped
 
 }  // namespace internal
 
+#if defined(_MSC_VER)
+#pragma warning(push)
+// C4324: 'benchmark::State': structure was padded due to alignment specifier
+#pragma warning(disable : 4324)
+#endif  // _MSC_VER
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
-class BENCHMARK_EXPORT State {
+class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State {
  public:
   struct StateIterator;
   friend struct StateIterator;
@@ -1024,6 +1068,9 @@ class BENCHMARK_EXPORT State {
 
   friend class internal::BenchmarkInstance;
 };
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif  // _MSC_VER
 
 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
   return KeepRunningInternal(1, /*is_batch=*/false);
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 685f5c688..4045cd64f 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -130,14 +130,14 @@ BenchmarkReporter::Run CreateRunReport(
 void RunInThread(const BenchmarkInstance* b, IterationCount iters,
                  int thread_id, ThreadManager* manager,
                  PerfCountersMeasurement* perf_counters_measurement,
-                 ProfilerManager* profiler_manager) {
+                 ProfilerManager* profiler_manager_) {
   internal::ThreadTimer timer(
       b->measure_process_cpu_time()
           ? internal::ThreadTimer::CreateProcessCpuTime()
           : internal::ThreadTimer::Create());
 
   State st = b->Run(iters, thread_id, &timer, manager,
-                    perf_counters_measurement, profiler_manager);
+                    perf_counters_measurement, profiler_manager_);
   BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations)
       << "Benchmark returned before State::KeepRunning() returned false!";
   {