diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 2afe4b3912df..af688a1486af 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -3538,6 +3538,9 @@ RBBILineMonkey::~RBBILineMonkey() { // // type = char | word | line | sent | title // +// export = (path) Export test cases to (path)_(type).txt in the UCD +// test case format. +// // Example: // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1" // @@ -3974,6 +3977,8 @@ void RBBITest::TestMonkey() { UnicodeString breakType = "all"; Locale locale("en"); UBool useUText = false; + UBool scalarsOnly = false; + std::string exportPath; if (quick == false) { loopCount = 10000; @@ -3998,6 +4003,19 @@ void RBBITest::TestMonkey() { p = u.replaceFirst("", status); } + RegexMatcher pathMatcher(" *export *= *([^ ]+) *", p, 0, status); + if (pathMatcher.find()) { + pathMatcher.group(1, status).toUTF8String(exportPath); + pathMatcher.reset(); + p = pathMatcher.replaceFirst("", status); + } + + RegexMatcher s(" *scalars_only", p, 0, status); + if (s.find()) { + scalarsOnly = true; + s.reset(); + p = s.replaceFirst("", status); + } // m.reset(p); if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { @@ -4013,64 +4031,80 @@ void RBBITest::TestMonkey() { } if (breakType == "char" || breakType == "all") { + FILE *file = exportPath.empty() ? 
nullptr : fopen((exportPath + "_char.txt").c_str(), "w"); RBBICharMonkey m; BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); if (U_SUCCESS(status)) { - RunMonkey(bi, m, "char", seed, loopCount, useUText); + RunMonkey(bi, m, "char", seed, loopCount, useUText, file, scalarsOnly); if (breakType == "all" && useUText==false) { // Also run a quick test with UText when "all" is specified - RunMonkey(bi, m, "char", seed, loopCount, true); + RunMonkey(bi, m, "char", seed, loopCount, true, nullptr, scalarsOnly); } } else { errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); } delete bi; + if (file != nullptr) { + fclose(file); + } } if (breakType == "word" || breakType == "all") { logln("Word Break Monkey Test"); + FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_word.txt").c_str(), "w"); RBBIWordMonkey m; BreakIterator *bi = BreakIterator::createWordInstance(locale, status); if (U_SUCCESS(status)) { - RunMonkey(bi, m, "word", seed, loopCount, useUText); + RunMonkey(bi, m, "word", seed, loopCount, useUText, file, scalarsOnly); } else { errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); } delete bi; + if (file != nullptr) { + fclose(file); + } } if (breakType == "line" || breakType == "all") { logln("Line Break Monkey Test"); + FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_line.txt").c_str(), "w"); RBBILineMonkey m; BreakIterator *bi = BreakIterator::createLineInstance(locale, status); if (loopCount >= 10) { loopCount = loopCount / 5; // Line break runs slower than the others. 
} if (U_SUCCESS(status)) { - RunMonkey(bi, m, "line", seed, loopCount, useUText); + RunMonkey(bi, m, "line", seed, loopCount, useUText, file, scalarsOnly); } else { errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); } delete bi; + if (file != nullptr) { + fclose(file); + } } if (breakType == "sent" || breakType == "all" ) { logln("Sentence Break Monkey Test"); + FILE *file = exportPath.empty() ? nullptr : fopen((exportPath + "_sent.txt").c_str(), "w"); RBBISentMonkey m; BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); if (loopCount >= 10) { loopCount = loopCount / 10; // Sentence runs slower than the other break types } if (U_SUCCESS(status)) { - RunMonkey(bi, m, "sent", seed, loopCount, useUText); + RunMonkey(bi, m, "sent", seed, loopCount, useUText, file, scalarsOnly); } else { errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); } delete bi; + if (file != nullptr) { + fclose(file); + } } #endif @@ -4079,14 +4113,19 @@ void RBBITest::TestMonkey() { // // Run a RBBI monkey test. Common routine, for all break iterator types. // Parameters: -// bi - the break iterator to use -// mk - MonkeyKind, abstraction for obtaining expected results -// name - Name of test (char, word, etc.) for use in error messages -// seed - Seed for starting random number generator (parameter from user) +// bi - the break iterator to use +// mk - MonkeyKind, abstraction for obtaining expected results +// name - Name of test (char, word, etc.) for use in error messages +// seed - Seed for starting random number generator (parameter from user) // numIterations +// exportFile - Pointer to a file to which the test cases will be written in +// UCD format. May be null. +// scalarsOnly - Only test sequences of Unicode scalar values; if this is false, +// arbitrary sequences of code points (including unpaired surrogates) +// are tested. 
// void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, - int32_t numIterations, UBool useUText) { + int32_t numIterations, UBool useUText, FILE *exportFile, UBool scalarsOnly) { #if !UCONFIG_NO_REGULAR_EXPRESSIONS @@ -4151,6 +4190,9 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name errln("%s:%d c < 0", __FILE__, __LINE__); break; } + if (scalarsOnly && U16_IS_SURROGATE(c)) { + continue; + } // Do not assemble a supplementary character from randomly generated separate surrogates. // (It could be a dictionary character) if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) { @@ -4267,6 +4309,16 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name } } + if (exportFile != nullptr) { + for (i = 0; i < testText.length();) { + fprintf(exportFile, expectedBreaks[i] ? "÷ " : "× "); + char32_t const c = testText.char32At(i); + fprintf(exportFile, "%04X ", static_cast<uint32_t>(c)); + i += U16_LENGTH(c); + } + fprintf(exportFile, expectedBreaks[testText.length()] ? "÷ # 🐒\n" : "× # 🐒\n"); + } + // Compare the expected and actual results. for (i=0; i<=testText.length(); i++) { const char *errorType = nullptr; diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 537a537863ad..3c2408ea2ca1 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -17,6 +17,8 @@ #if !UCONFIG_NO_BREAK_ITERATION +#include <stdio.h> + #include <memory> #include "intltest.h" @@ -122,7 +124,7 @@ class RBBITest: public IntlTest { **/ void RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, - int32_t loopCount, UBool useUText); + int32_t loopCount, UBool useUText, FILE *exportFile, UBool scalarsOnly); // Run one of the Unicode Consortium boundary test data files. void runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi);