From cf49ba2362de69e4fd45aa1209bf9893ba95888a Mon Sep 17 00:00:00 2001 From: Shengfa <3363396+k7z45@users.noreply.github.com> Date: Tue, 20 Oct 2020 14:52:30 -0500 Subject: [PATCH] [atoms] Fix getText atom for Unicode character in the middle of a word (#8736) * [atoms] Fix getText atom for Unicode character in the middle of a word The change to the getText atom in commit c065ddaeb6c5cf91ebf797c242128604e38b79e1 does not handle the case where a Unicode character is in the middle of a word, so the Unicode character is incorrectly capitalized (see https://bugs.chromium.org/p/chromedriver/issues/detail?id=3611). The problem is that \b treats the position between a word character and a Unicode character as a word boundary. This is fixed by explicitly using the unicode flag and matching Unicode letter and symbol property classes. * [atoms] Add meta tag for charset in test HTML Add a utf-8 charset declaration to text_test.html --- javascript/atoms/dom.js | 2 +- javascript/atoms/test/text_test.html | 29 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/javascript/atoms/dom.js b/javascript/atoms/dom.js index 68dbdacc163ee..3e683184e94f1 100644 --- a/javascript/atoms/dom.js +++ b/javascript/atoms/dom.js @@ -1175,7 +1175,7 @@ bot.dom.appendVisibleTextLinesFromTextNode_ = function(textNode, lines, } if (textTransform == 'capitalize') { - text = text.replace(/(^|\s|\b)(\S)/g, function() { + text = text.replace(/(^|[^\d\p{L}\p{S}])([\p{Ll}|\p{S}])/gu, function() { return arguments[1] + arguments[2].toUpperCase(); }); } else if (textTransform == 'uppercase') { diff --git a/javascript/atoms/test/text_test.html b/javascript/atoms/test/text_test.html index 4698ac5db46db..5dc0d1ccbaa4e 100644 --- a/javascript/atoms/test/text_test.html +++ b/javascript/atoms/test/text_test.html @@ -18,6 +18,7 @@ text_test.html +