Skip to content

Commit c1b6ec6

Browse files
authored
fix: 导入文档中含有特殊字符时,导入失败。 #363 (#372)
1 parent 7d62842 commit c1b6ec6

File tree

2 files changed

+16
-4
lines changed

2 files changed

+16
-4
lines changed

apps/common/util/ts_vecto_util.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import List
1212

1313
import jieba
14+
import jieba.posseg
1415
from jieba import analyse
1516

1617
from common.util.split_model import group_by
@@ -25,7 +26,9 @@
2526
# Regular expressions for special tokens (version numbers and e-mail addresses).
# NOTE(review): the code that consumes these patterns is outside this view.
word_pattern_list = [
    # Version strings such as "v1.2.3". The dots are escaped so that only a
    # literal "." separates the numeric parts (an unescaped "." matches any
    # character, e.g. "v12345").
    r"v\d+\.\d+\.\d+",
    # E-mail addresses. The top-level-domain class is letters only; the
    # earlier "[A-Z|a-z]" form also accepted a literal "|".
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
]
2728

28-
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
29+
# Set of characters (whitespace and punctuation) named for removal.
# NOTE(review): the code that applies this string is outside this view — confirm how it is used.
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./'
30+
31+
# jieba part-of-speech flags to discard: to_ts_vector drops every segmented
# token whose flag appears in this list, which is how punctuation is filtered out.
jieba_remove_flag_list = ['x', 'w']
2932

3033

3134
def get_word_list(text: str):
@@ -82,8 +85,11 @@ def to_ts_vector(text: str):
8285
# 替换字符串
8386
text = replace_word(word_dict, text)
8487
# 分词
85-
result = jieba.tokenize(text, mode='search')
86-
result_ = [{'word': get_key_by_word_dict(item[0], word_dict), 'index': item[1]} for item in result]
88+
result = jieba.posseg.lcut(text, HMM=True, use_paddle=True)
89+
# 过滤标点符号
90+
result = [item for item in result if not jieba_remove_flag_list.__contains__(item.flag)]
91+
result_ = [{'word': get_key_by_word_dict(result[index].word, word_dict), 'index': index} for index in
92+
range(len(result))]
8793
result_group = group_by(result_, lambda r: r['word'])
8894
return " ".join(
8995
[f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}" for key in

apps/embedding/migrations/0002_embedding_search_vector.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Generated by Django 4.1.13 on 2024-04-16 11:43
2+
import threading
23

34
import django.contrib.postgres.search
45
from django.db import migrations
@@ -44,6 +45,11 @@ def save_keywords(apps, schema_editor):
4445
print(e)
4546

4647

48+
def async_save_keywords(apps, schema_editor):
    """Run ``save_keywords`` on a separate thread without waiting for it.

    A new thread is created with ``save_keywords`` as its target and the
    given ``apps`` and ``schema_editor`` as its arguments, then started.
    The thread is not joined, so this function returns as soon as the
    thread has been started.
    """
    threading.Thread(
        target=save_keywords,
        args=(apps, schema_editor),
    ).start()
51+
52+
4753
class Migration(migrations.Migration):
4854
dependencies = [
4955
('embedding', '0001_initial'),
@@ -55,5 +61,5 @@ class Migration(migrations.Migration):
5561
name='search_vector',
5662
field=django.contrib.postgres.search.SearchVectorField(default='', verbose_name='分词'),
5763
),
58-
migrations.RunPython(save_keywords)
64+
migrations.RunPython(async_save_keywords)
5965
]

0 commit comments

Comments
 (0)