2 files changed: +16 -4 lines changed
Original file line number | Diff line number | Diff line change
11
11
from typing import List
12
12
13
13
import jieba
14
+ import jieba .posseg
14
15
from jieba import analyse
15
16
16
17
from common .util .split_model import group_by
25
26
word_pattern_list = [r"v\d+.\d+.\d+" ,
26
27
r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}" ]
27
28
28
- remove_chars = '\n , :\' <>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
29
+ remove_chars = '\n , :\' <>!@#¥%……&*()!@#$%^&*(): ;,/"./'
30
+
31
+ jieba_remove_flag_list = ['x' , 'w' ]
29
32
30
33
31
34
def get_word_list (text : str ):
@@ -82,8 +85,11 @@ def to_ts_vector(text: str):
82
85
# 替换字符串
83
86
text = replace_word (word_dict , text )
84
87
# 分词
85
- result = jieba .tokenize (text , mode = 'search' )
86
- result_ = [{'word' : get_key_by_word_dict (item [0 ], word_dict ), 'index' : item [1 ]} for item in result ]
88
+ result = jieba .posseg .lcut (text , HMM = True , use_paddle = True )
89
+ # 过滤标点符号
90
+ result = [item for item in result if not jieba_remove_flag_list .__contains__ (item .flag )]
91
+ result_ = [{'word' : get_key_by_word_dict (result [index ].word , word_dict ), 'index' : index } for index in
92
+ range (len (result ))]
87
93
result_group = group_by (result_ , lambda r : r ['word' ])
88
94
return " " .join (
89
95
[f"{ key .lower ()} :{ ',' .join ([str (item ['index' ] + 1 ) for item in result_group [key ]][:20 ])} " for key in
Original file line number Diff line number Diff line change 1
1
# Generated by Django 4.1.13 on 2024-04-16 11:43
2
+ import threading
2
3
3
4
import django .contrib .postgres .search
4
5
from django .db import migrations
@@ -44,6 +45,11 @@ def save_keywords(apps, schema_editor):
44
45
print (e )
45
46
46
47
48
# Wrapper that launches save_keywords(apps, schema_editor) on a new
# threading.Thread and returns immediately after start(); the thread is
# never joined here.
# NOTE(review): the caller may therefore continue before save_keywords has
# finished; confirm that apps / schema_editor are still usable from that
# thread once the caller has moved on — TODO verify.
+ def async_save_keywords (apps , schema_editor ):
49
+ thread = threading .Thread (target = save_keywords , args = (apps , schema_editor ))
50
+ thread .start ()
51
+
52
+
47
53
class Migration (migrations .Migration ):
48
54
dependencies = [
49
55
('embedding' , '0001_initial' ),
@@ -55,5 +61,5 @@ class Migration(migrations.Migration):
55
61
name = 'search_vector' ,
56
62
field = django .contrib .postgres .search .SearchVectorField (default = '' , verbose_name = '分词' ),
57
63
),
58
- migrations .RunPython (save_keywords )
64
+ migrations .RunPython (async_save_keywords )
59
65
]
You can’t perform that action at this time.
0 commit comments