Fix sanitizer issues [BMQ,MWC] (bloomberg#373)

alexander-e1off · syuzvinsky · commit ea6e97375303 · 2024-08-30T09:23:39.000-04:00
diff --git a/etc/msansup.txt b/etc/msansup.txt
@@ -1,2 +1,4 @@
+# MemorySanitizer suppressions file for BlazingMQ.
+
 # Locking APIs from non-instrumented `libpthread` yields false-positives.
 fun:_ZN11BloombergLP5bslmt9MutexImplINS0_8Platform12PosixThreadsEE*
diff --git a/etc/tsansup.txt b/etc/tsansup.txt
@@ -1,34 +1,32 @@
-# ThreadSanitizer suppressions file for BMQ
+# ThreadSanitizer suppressions file for BlazingMQ.
 
 # There's a lengthy comment in ObjectPool::getObject that explains why this
 # isn't a race.  I'm not smart enough to figure out if it's right, but I'll
 # assume it's right
 race:BloombergLP::bdlcc::ObjectPool<*>::getObject
 race:BloombergLP::bdlcc::ObjectPool<*>::releaseObject
+
+# Issue 176120121 is created for tracking.
 race:BloombergLP::bdlma::ConcurrentPool::deallocate
 race:BloombergLP::bdlma::ConcurrentPool::allocate
 race:BloombergLP::bdlma::ConcurrentPool::deleteObject
 
-# Similar to above, there is a lengthy comment in bcema_Pool::allocate which
-# attempts to explain why there is no race.
-race:BloombergLP::bcema_Pool<*>::allocate
-
-# Not sure what the problem is here, but tsan can't show the other stack, so
-# there's nothing to look into
-race:__tsan_atomic32_fetch_add
-
 # Don't warn about using cout from multiple threads
 race:std::basic_ostream<char, *>& bsl::operator<< <*>(std::basic_ostream<*>&, bsl::basic_string<*> const&)
 
 # Looks like ball::LoggerManager uses a plain ptr to store its singleton, and
 # it makes tsan warn in some cases
 race:BloombergLP::ball::LoggerManager::isInitialized()
 
-# Suppress TSan report in a routine used in bmqimp::Brokersession test driver.
-# It is a benign race in the test driver, but should be looked into at some
-# point.
-race:TestSession::waitForChannelClose
-race:TestSession::arriveAtStepWithCfgs
+# Suppress sporadically appearing data race in bmqimp::BrokerSession test driver.
+# In TestSession::arriveAtStepWithCfgs() there is a call of queue->setOptions() method,
+# at nearly same time in other thread bmqimp::BrokerSession::onConfigureQueueResponse() calls
+# queue->options().suspendsOnBadHostHealth() method which is detected as data race.
+# bmqt::QueueOptions and bmqimp::Queue classes are not thread safe by design,
+# and bmqimp::BrokerSession::onConfigureQueueResponse() callback access them in
+# not thread-safe manner, probably also by design, assuming that it will be called again
+# if something is changed. Further investigation is required, suppress it for now.
+race:BloombergLP::bmqt::QueueOptions::suspendsOnBadHostHealth
 
 # Since we use mqbmock::Dispatcher in unit tests, this method does not get
 # enqueued correctly back to cluster-dispatcher thread, causing potential
diff --git a/etc/ubsansup.txt b/etc/ubsansup.txt
@@ -1,6 +1,2 @@
-# bmqp::Crc32c carries out misaligned access of Int64 in one of the internal
-# routines, but only on x86/64.  Misaligned accesses are handled gracefully by
-# this hardware, unlike SPARC.  So we simply suppress this warning.  Other
-# option is to update the code, which can be done by someone feeling
-# adventurous.
-alignment:crc32c1024SseInt
+# UndefinedBehaviorSanitizer suppressions file for BlazingMQ.
+# See details https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#runtime-suppressions.
diff --git a/src/groups/bmq/bmqimp/bmqimp_brokersession.t.cpp b/src/groups/bmq/bmqimp/bmqimp_brokersession.t.cpp
@@ -2196,12 +2196,12 @@ bool TestSession::waitForChannelClose(const bsls::TimeInterval& timeout)
     // Wait for the close to be called on the base channel
     const bsls::TimeInterval expireAfter =
         bsls::SystemTime::nowRealtimeClock() + timeout;
-    while (d_testChannel.closeCalls().empty() &&
+    while (d_testChannel.closeCallsEmpty() &&
            bsls::SystemTime::nowRealtimeClock() < expireAfter) {
         bslmt::ThreadUtil::microSleep(k_TIME_SOURCE_STEP.totalMicroseconds());
     }
 
-    if (d_testChannel.closeCalls().empty()) {
+    if (d_testChannel.closeCallsEmpty()) {
         return false;  // RETURN
     }
 
diff --git a/src/groups/bmq/bmqp/bmqp_messageguidgenerator.t.cpp b/src/groups/bmq/bmqp/bmqp_messageguidgenerator.t.cpp
@@ -661,7 +661,11 @@ static void test3_multithreadUseIP()
 
     const int k_NUM_THREADS = 10;
 
-#if defined(__has_feature)
+#ifdef BSLS_PLATFORM_OS_SOLARIS
+    // This test case times out if 'k_NUM_GUIDS' is close to 1 million
+    // (it's unable to complete in 90 seconds).
+    const int k_NUM_GUIDS = 500000;  // 500k
+#elif defined(__has_feature)
     // Avoid timeout under MemorySanitizer
     const int k_NUM_GUIDS = __has_feature(memory_sanitizer) ? 500000    // 500k
                                                             : 1000000;  // 1M
diff --git a/src/groups/mqb/mqbs/mqbs_datastore.t.cpp b/src/groups/mqb/mqbs/mqbs_datastore.t.cpp
@@ -134,7 +134,10 @@ static void test2_defaultHashUniqueness()
 
     mwctst::TestHelper::printTestName("DEFAULT HASH UNIQUENESS");
 
-#if defined(__has_feature)
+#ifdef BSLS_PLATFORM_OS_SOLARIS
+    // This test case times out if 'k_NUM_GUIDS' is close to 1 million
+    const bsls::Types::Int64 k_NUM_KEYS = 1000000;  // 1M
+#elif defined(__has_feature)
     // Avoid timeout under MemorySanitizer
     const bsls::Types::Int64 k_NUM_KEYS = __has_feature(memory_sanitizer)
                                               ? 1000000    // 1M
diff --git a/src/groups/mqb/mqbu/mqbu_messageguidutil.t.cpp b/src/groups/mqb/mqbu/mqbu_messageguidutil.t.cpp
@@ -182,7 +182,11 @@ static void test2_multithread()
 
     const int k_NUM_THREADS = 10;
 
-#if defined(__has_feature)
+#ifdef BSLS_PLATFORM_OS_SOLARIS
+    // This test case times out if 'k_NUM_GUIDS' is close to 1 million
+    // (it's unable to complete in 90 seconds).
+    const int k_NUM_GUIDS = 500000;  // 500k
+#elif defined(__has_feature)
     // Avoid timeout under MemorySanitizer
     const int k_NUM_GUIDS = __has_feature(memory_sanitizer) ? 500000    // 500k
                                                             : 1000000;  // 1M
diff --git a/src/groups/mwc/mwcio/mwcio_ntcchannelfactory.t.cpp b/src/groups/mwc/mwcio/mwcio_ntcchannelfactory.t.cpp
@@ -98,19 +98,22 @@ static const ChannelWatermarkType::Enum WAT_HIGH =
 static const ChannelWatermarkType::Enum WAT_LOW =
     ChannelWatermarkType::e_LOW_WATERMARK;
 
+// Adjust attempt interval depending on the platform
 #ifdef BSLS_PLATFORM_OS_SOLARIS
-static const bool skipTest = true;
+static const int k_ATTEMPT_INTERVAL_MS = 300;
 #elif defined(                                                                \
     __has_feature)  // Clang-supported method for checking sanitizers.
-static const bool skipTest = __has_feature(memory_sanitizer) ||
-                             __has_feature(thread_sanitizer) ||
-                             __has_feature(undefined_behavior_sanitizer);
+static const int k_ATTEMPT_INTERVAL_MS =
+    (__has_feature(memory_sanitizer) || __has_feature(thread_sanitizer) ||
+     __has_feature(undefined_behavior_sanitizer))
+        ? 400
+        : 1;
 #elif defined(__SANITIZE_MEMORY__) || defined(__SANITIZE_THREAD__) ||         \
     defined(__SANITIZE_UNDEFINED__)
 // GCC-supported macros for checking MSAN, TSAN and UBSAN.
-static const bool skipTest = true;
+static const int k_ATTEMPT_INTERVAL_MS = 400;
 #else
-static const bool skipTest = false;  // Default to running the test.
+static const int k_ATTEMPT_INTERVAL_MS = 1;
 #endif
 
 // ========================
@@ -658,10 +661,9 @@ void Tester::connect(int                          line,
 
     ConnectOptions reqOptions(options, s_allocator_p);
 
-    reqOptions.setAttemptInterval(
-        bsls::TimeInterval(0, 10 * bdlt::TimeUnitRatio::k_NS_PER_MS));
-
-    reqOptions.setAttemptInterval(bsls::TimeInterval(0, 1));
+    reqOptions.setAttemptInterval(bsls::TimeInterval(
+        0,
+        k_ATTEMPT_INTERVAL_MS * bdlt::TimeUnitRatio::k_NS_PER_MS));
 
     HandleMap::iterator serverIter = d_handleMap.find(endpointOrServer);
     if (serverIter != d_handleMap.end()) {
@@ -1055,22 +1057,6 @@ static void test6_preCreationCbTest()
 {
     mwctst::TestHelper::printTestName("Pre Creation Cb Test");
 
-    if (skipTest) {
-        // This test has been disabled for Solaris/MSan/TSan/UBSan build. This
-        // test relies on the timings of certain callbacks being fired before
-        // or after certain operations. Normally this timing is always
-        // observed, but in msan/tsan/ubsan enabled build, the timing gets
-        // changed, leading to test failure. Of course, the right fix is to
-        // not rely on these timings, which can be worked on if the test starts
-        // failing in non-instrumented builds. Additionally, we could try to
-        // enable this test in MSan/TSan/UBSan build once all MSan/TSan/UBSan
-        // reports have been fixed to see if that helps (see `msansup.txt`,
-        // `tsansup.txt` and `ubsansup.txt`).
-        bsl::cout << "Test skipped (running on Solaris or under sanitizer)"
-                  << bsl::endl;
-        return;  // RETURN
-    }
-
     Tester t(s_allocator_p);
 
     // Concern 'a'
@@ -1106,22 +1092,6 @@ static void test5_visitChannelsTest()
 {
     mwctst::TestHelper::printTestName("Cancel Handle Test");
 
-    if (skipTest) {
-        // This test has been disabled for Solaris/MSan/TSan/UBSan build. This
-        // test relies on the timings of certain callbacks being fired before
-        // or after certain operations. Normally this timing is always
-        // observed, but in msan/tsan/ubsan enabled build, the timing gets
-        // changed, leading to test failure. Of course, the right fix is to
-        // not rely on these timings, which can be worked on if the test starts
-        // failing in non-instrumented builds. Additionally, we could try to
-        // enable this test in MSan/TSan/UBSan build once all MSan/TSan/UBSan
-        // reports have been fixed to see if that helps (see `msansup.txt`,
-        // `tsansup.txt` and `ubsansup.txt`).
-        bsl::cout << "Test skipped (running on Solaris or under sanitizer)"
-                  << bsl::endl;
-        return;  // RETURN
-    }
-
     Tester t(s_allocator_p);
 
     // Concerns 'a'
@@ -1162,22 +1132,6 @@ static void test4_cancelHandleTest()
 {
     mwctst::TestHelper::printTestName("Cancel Handle Test");
 
-    if (skipTest) {
-        // This test has been disabled for Solaris/MSan/TSan/UBSan build. This
-        // test relies on the timings of certain callbacks being fired before
-        // or after certain operations. Normally this timing is always
-        // observed, but in msan/tsan/ubsan enabled build, the timing gets
-        // changed, leading to test failure. Of course, the right fix is to
-        // not rely on these timings, which can be worked on if the test starts
-        // failing in non-instrumented builds. Additionally, we could try to
-        // enable this test in MSan/TSan/UBSan build once all MSan/TSan/UBSan
-        // reports have been fixed to see if that helps (see `msansup.txt`,
-        // `tsansup.txt` and `ubsansup.txt`).
-        bsl::cout << "Test skipped (running on Solaris or under sanitizer)"
-                  << bsl::endl;
-        return;  // RETURN
-    }
-
     Tester t(s_allocator_p);
 
     // Concerns 'a'
@@ -1232,22 +1186,6 @@ static void test3_watermarkTest()
 {
     mwctst::TestHelper::printTestName("Watermark Test");
 
-    if (skipTest) {
-        // This test has been disabled for Solaris/MSan/TSan/UBSan build. This
-        // test relies on the timings of certain callbacks being fired before
-        // or after certain operations. Normally this timing is always
-        // observed, but in msan/tsan/ubsan enabled build, the timing gets
-        // changed, leading to test failure. Of course, the right fix is to
-        // not rely on these timings, which can be worked on if the test starts
-        // failing in non-instrumented builds. Additionally, we could try to
-        // enable this test in MSan/TSan/UBSan build once all MSan/TSan/UBSan
-        // reports have been fixed to see if that helps (see `msansup.txt`,
-        // `tsansup.txt` and `ubsansup.txt`).
-        bsl::cout << "Test skipped (running on Solaris or under sanitizer)"
-                  << bsl::endl;
-        return;  // RETURN
-    }
-
     Tester t(s_allocator_p);
 
     // Concern 'a'
@@ -1337,22 +1275,6 @@ static void test1_breathingTest()
 {
     mwctst::TestHelper::printTestName("Breathing Test");
 
-    if (skipTest) {
-        // This test has been disabled for Solaris/MSan/TSan/UBSan build. This
-        // test relies on the timings of certain callbacks being fired before
-        // or after certain operations. Normally this timing is always
-        // observed, but in msan/tsan/ubsan enabled build, the timing gets
-        // changed, leading to test failure. Of course, the right fix is to
-        // not rely on these timings, which can be worked on if the test starts
-        // failing in non-instrumented builds. Additionally, we could try to
-        // enable this test in MSan/TSan/UBSan build once all MSan/TSan/UBSan
-        // reports have been fixed to see if that helps (see `msansup.txt`,
-        // `tsansup.txt` and `ubsansup.txt`).
-        bsl::cout << "Test skipped (running on Solaris or under sanitizer)"
-                  << bsl::endl;
-        return;  // RETURN
-    }
-
     Tester t(s_allocator_p);
     t.init(L_);
 
diff --git a/src/groups/mwc/mwcio/mwcio_testchannel.cpp b/src/groups/mwc/mwcio/mwcio_testchannel.cpp
@@ -293,5 +293,11 @@ bool TestChannel::hasNoMoreWriteCalls() const
     return d_hasNoMoreWriteCalls;
 }
 
+bool TestChannel::closeCallsEmpty() const
+{
+    bslmt::LockGuard<bslmt::Mutex> guard(&d_mutex);  // LOCK
+    return d_closeCalls.empty();
+}
+
 }  // close package namespace
 }  // close enterprise namespace
diff --git a/src/groups/mwc/mwcio/mwcio_testchannel.h b/src/groups/mwc/mwcio/mwcio_testchannel.h
@@ -184,7 +184,7 @@ class TestChannel : public Channel {
     bsl::deque<OnWatermarkCall> d_onWatermarkCalls;
     mwct::PropertyBag           d_properties;
     bsl::string                 d_peerUri;
-    bslmt::Mutex                d_mutex;
+    mutable bslmt::Mutex        d_mutex;
     bslmt::Condition            d_condition;
     bool                        d_isFinal;
     bool                        d_hasNoMoreWriteCalls;
@@ -258,6 +258,10 @@ class TestChannel : public Channel {
     const Status& writeStatus() const;
     bool          hasNoMoreWriteCalls() const;
 
+    /// Lock mutex and return `true` if d_closeCalls collection is empty,
+    /// `false` otherwise.
+    bool closeCallsEmpty() const;
+
     // Channel
     void read(Status*                   status,
               int                       numBytes,
diff --git a/src/groups/mwc/mwcma/mwcma_countingallocator.t.cpp b/src/groups/mwc/mwcma/mwcma_countingallocator.t.cpp
@@ -191,23 +191,24 @@ static void test3_deallocate()
     mwctst::TestHelper::printTestName("DEALLOCATE");
 
     // This test twice-deallocates the same block of memory, to verify that
-    // such an operation fails. If we're running under MemorySanitizer or
-    // AddressSanitizer, we must skip this test to avoid detecting the issue
-    // and aborting.
+    // such an operation fails. If we're running under MemorySanitizer,
+    // AddressSanitizer or ThreadSanitizer, we must skip this test to avoid
+    // detecting the issue and aborting.
     //
     // Under MSan, we would instead try to explicitly "unpoison" the memory,
     // but CountingAllocator keeps a hidden "header" block, which we have no
     // good way of accessing to unpoison.
     //
     // Under ASan, we might be able to use the `no_sanitize` attribute, but
-    // GCC doesn't support it before versio 8.0 - so for now, better just to
+    // GCC doesn't support it before version 8.0 - so for now, better just to
     // skip the testcase.
 #if defined(__has_feature)  // Clang-supported method for checking sanitizers.
     const bool skipTestForSanitizers = __has_feature(address_sanitizer) ||
                                        __has_feature(memory_sanitizer) ||
                                        __has_feature(thread_sanitizer);
-#elif defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
-    // GCC-supported macros for checking ASAN and TSAN.
+#elif defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_MEMORY__) ||        \
+    defined(__SANITIZE_THREAD__)
+    // GCC-supported macros for checking ASAN, MSAN and TSAN.
     const bool skipTestForSanitizers = true;
 #else
     const bool skipTestForSanitizers = false;  // Default to running the test.

Original file line number	Diff line number	Diff line change
`@@ -2196,12 +2196,12 @@ bool TestSession::waitForChannelClose(const bsls::TimeInterval& timeout)`
`2196`	`2196`	`// Wait for the close to be called on the base channel`
`2197`	`2197`	`const bsls::TimeInterval expireAfter =`
`2198`	`2198`	`bsls::SystemTime::nowRealtimeClock() + timeout;`
`2199`		`- while (d_testChannel.closeCalls().empty() &&`
	`2199`	`+ while (d_testChannel.closeCallsEmpty() &&`
`2200`	`2200`	`bsls::SystemTime::nowRealtimeClock() < expireAfter) {`
`2201`	`2201`	`bslmt::ThreadUtil::microSleep(k_TIME_SOURCE_STEP.totalMicroseconds());`
`2202`	`2202`	`}`
`2203`	`2203`
`2204`		`- if (d_testChannel.closeCalls().empty()) {`
	`2204`	`+ if (d_testChannel.closeCallsEmpty()) {`
`2205`	`2205`	`return false; // RETURN`
`2206`	`2206`	`}`
`2207`	`2207`