1 file changed: +7 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -3079,7 +3079,7 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
30793079 else if (collecting_special && (codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type (utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
30803080 split_condition = true ;
30813081 }
3082- else if (collecting_whitespace_lookahead && ( codepoint_type (utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type (utf_char_next) == CODEPOINT_TYPE_DIGIT) ) {
3082+ else if (collecting_whitespace_lookahead && codepoint_type (utf_char_next) != CODEPOINT_TYPE_WHITESPACE ) {
30833083 split_condition = true ;
30843084 }
30853085 }
@@ -3101,7 +3101,12 @@ static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
31013101 collecting_whitespace_lookahead = false ;
31023102 }
31033103 else {
3104- token += utf_char;
3104+ if (codepoint_type (token) == CODEPOINT_TYPE_PUNCTUATION && codepoint_type (utf_char) == CODEPOINT_TYPE_LETTER) {
3105+ bpe_words.emplace_back (token);
3106+ token = utf_char;
3107+ } else {
3108+ token += utf_char;
3109+ }
31053110 }
31063111 }
31073112
You can’t perform that action at this time.
0 commit comments