Skip to content

Commit 7f30d03

Browse files
authored
fix: 修复分词超过数据库最大限制 (#401)
1 parent 8159fef commit 7f30d03

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

apps/common/util/ts_vecto_util.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -85,10 +85,11 @@ def to_ts_vector(text: str):
85 85
# 替换字符串
86 86
text = replace_word(word_dict, text)
87 87
# 分词
88-
result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
88+
filter_word = jieba.analyse.extract_tags(text, topK=100)
89+
result = jieba.lcut(text, HMM=True, use_paddle=True)
89 90
# 过滤标点符号
90-
result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)]
91-
result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in
91+
result = [item for item in result if filter_word.__contains__(item) and len(item) < 10]
92+
result_ = [{'word': get_key_by_word_dict(result[index], word_dict), 'index': index} for index in
92 93
range(len(result))]
93 94
result_group = group_by(result_, lambda r: r['word'])
94 95
return " ".join(

0 commit comments

Comments
 (0)