Skip to content

Commit

Permalink
feat(tokenizer): ensure distinct tokenization results
Browse files Browse the repository at this point in the history
Updated the tokenizer methods in TokenizerProcessor.kt to return distinct results, ensuring that the tokenization process does not produce duplicate tokens. The `distinct()` call was removed from RegexpTokenizer.kt because de-duplication is now handled in the main tokenizer methods.
  • Loading branch information
phodal committed Oct 4, 2024
1 parent 59c1f25 commit 09746ca
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ open class RegexpTokenizer(opts: RegexTokenizerOptions? = null) : Tokenizer {
results.ifEmpty { emptyList() }
}

return output.distinct()
return output
}

fun without(arr: List<String>, vararg values: String): List<String> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,22 @@ class TokenizerProcessor {
}
)

return tokenizer.tokenize(action.text)
return tokenizer.tokenize(action.text).distinct()
}

when (action.tokType) {
"word" -> {
val tokenizer = WordTokenizer()
return tokenizer.tokenize(action.text)
return tokenizer.tokenize(action.text).distinct()
}

"naming" -> {
val tokenizer = CodeNamingTokenizer()
return tokenizer.tokenize(action.text)
return tokenizer.tokenize(action.text).distinct()
}

"stopwords" -> {
return StopwordsBasedTokenizer.instance().tokenize(action.text)
return StopwordsBasedTokenizer.instance().tokenize(action.text).distinct()
}

"jieba" -> {
Expand All @@ -48,7 +48,7 @@ class TokenizerProcessor {

else -> {
val tokenizer = WordTokenizer()
return tokenizer.tokenize(action.text)
return tokenizer.tokenize(action.text).distinct()
}
}
}
Expand Down

0 comments on commit 09746ca

Please sign in to comment.