Skip to content

Commit

Permalink
更新依赖及搜狗词库,增加合并输出词库
Browse files Browse the repository at this point in the history
  • Loading branch information
gshang2017 committed Jun 14, 2024
1 parent ee1ec7c commit 8bf848d
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 22 deletions.
15 changes: 8 additions & 7 deletions .github/workflows/rime_dict.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,20 @@ on:
workflow_dispatch:

env:
IMEWLCONVERTER_VER: 3.0.0
IMEWLCONVERTER_VER: 3.1.0
DOTNET_VER: '8.0'

jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3

- name: Setup .NET Core
uses: actions/setup-dotnet@v3
with:
dotnet-version: 6.0.x
dotnet-version: 8.0.x

- name: set env
run: |
Expand All @@ -37,14 +38,13 @@ jobs:
#多音字
sed -i "1i\'cheng'du 成都" ./src/ImeWlConverterCore/Resources/WordPinyin.txt
dotnet build ./src/ImeWlConverterCmd
cd /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net6.0
cd /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }}
tar -zcf v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz *
cp -rf /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net6.0 /home/runner/work/rime-dict/rime-dict/imewlconverter
cp -rf /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }} /home/runner/work/rime-dict/rime-dict/imewlconverter
wget -P /home/runner/work/rime-dict/rime-dict https://github.com/gshang2017/rime-dict/releases/download/${{ env.TAG_NAME }}/dict.tar.gz
cd /home/runner/work/rime-dict/rime-dict/
tar -zxf dict.tar.gz
rm -rf dict.tar.gz
sudo pip install --upgrade setuptools==57.5.0
sudo pip install -r requirements.txt
tar -zcf dict.tar.gz dict.db
export SOGOU_OFFICIAL=True
Expand All @@ -59,6 +59,7 @@ jobs:
export WIKI_DICT=True
export CHAIZI_DICT=True
export IMEWLCONVERTER=False
export RIME_DICT_ALLINONE=True
python3 rime_dict.py
python3 sogou_pop_dict.py
cd /home/runner/work/rime-dict/rime-dict/output
Expand Down Expand Up @@ -89,4 +90,4 @@ jobs:
/home/runner/work/rime-dict/rime-dict/output/rime-dict.txt.tar.gz
/home/runner/work/rime-dict/rime-dict/output/rime-dict-non-tengxun-del.yaml.tar.gz
/home/runner/work/rime-dict/rime-dict/output/rime-dict-non-tengxun-del.txt.tar.gz
/home/runner/work/rime-dict/rime-dict/imewlconverter-3.0.0/src/ImeWlConverterCmd/bin/Debug/net6.0/v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz
/home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }}/v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz
24 changes: 12 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@
| `chaizi_dict` |拆字词库|
| `lettered_word_dict` |字母词词库|
| `sogou_pop.dict` |搜狗流行词词库|
| `total_dict.dict` |除流行词词库外全部词库合并版|

### 用docker生成

* 详见:[https://github.com/gshang2017/rime-dict/tree/main/docker](https://github.com/gshang2017/rime-dict/tree/main/docker "https://github.com/gshang2017/rime-dict/tree/main/docker")

### 用命令行生成(ubuntu-20.04)
### 用命令行生成(ubuntu-22.04)

1.安装下载依赖

Expand All @@ -63,26 +64,21 @@

4.下载修改版imewlconverter

wget https://github.com/gshang2017/rime-dict/releases/download/2023.03.20/v3.0.0_imewlconverter_Linux_Mac.tar.gz
wget https://github.com/gshang2017/rime-dict/releases/download/2024.06.14/v3.1.0_imewlconverter_Linux_Mac.tar.gz
mkdir -p imewlconverter
tar -zxf v3.0.0_imewlconverter_Linux_Mac.tar.gz -C imewlconverter
tar -zxf v3.1.0_imewlconverter_Linux_Mac.tar.gz -C imewlconverter

5.下载rime-dict数据库

wget https://github.com/gshang2017/rime-dict/releases/download/2023.03.20/dict.tar.gz
wget https://github.com/gshang2017/rime-dict/releases/download/2024.06.14/dict.tar.gz
tar -zxf dict.tar.gz

6.安装rime-dict依赖

sudo apt install python3-pip
sudo pip install --upgrade setuptools==57.5.0
sudo pip install -r requirements.txt
sudo apt install -y opencc
wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb
sudo dpkg -i packages-microsoft-prod.deb
rm packages-microsoft-prod.deb
sudo apt-get update
sudo apt install -y dotnet-sdk-6.0
sudo apt install -y dotnet-sdk-8.0

7.更新搜狗词库及词频(可选,因更新时间太长)

Expand All @@ -103,10 +99,14 @@
export LEN_NUM=7
export PREFIX_DICT_NAME=luna_pinyin_simp.

9.输出输入法文件(rime-dict/output)
9.合并输出全部词库

export RIME_DICT_ALLINONE=True

10.输出输入法文件(rime-dict/output)

python3 rime_dict.py

10.输出搜狗流行词词库输入法文件(rime-dict/output)
11.输出搜狗流行词词库输入法文件(rime-dict/output)

python3 sogou_pop_dict.py
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ beautifulsoup4
fake_useragent
pandas
datetime
python-csv
#python-csv
python-csv-demjson3
pypinyin
#imewlconverter https://github.com/gshang2017/rime-dict/releases
#opencc
#dotnet-sdk-6.0
#dotnet-sdk-8.0
38 changes: 38 additions & 0 deletions rime_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,19 @@ def order(filename_1):
table_order_dict.to_csv(filename_1+".dict.yaml",header=0,sep='\t',index=False,float_format='%.0f')
cursor.execute("DROP TABLE IF EXISTS tmp_order" )

#插入词库到tmp_allinone_dict表
def insert_dict_to_allinone(filename_1):
    """Append the entries of ``<filename_1>.dict.yaml`` to ``tmp_allinone_dict``.

    The file is read as tab-separated text with three fields per line, which
    are inserted in order into the three columns of ``tmp_allinone_dict``.

    When the module-level ``len_num_set`` converts to a non-zero integer,
    entries whose first field is longer than that many characters are left
    out; a value of zero disables the length filter.

    Relies on the module-level ``cursor``, ``dict_db`` and ``len_num_set``.

    Args:
        filename_1: Path of the dictionary file without the ``.dict.yaml``
            suffix.
    """
    # Convert once instead of on every line.
    max_len = int(len_num_set)
    rows = []
    # The context manager closes the file even if reading fails part-way.
    with open(filename_1 + ".dict.yaml", 'r', encoding='UTF-8') as fr_allinone:
        for line in fr_allinone:
            tmp_data = line.replace("\n", "").split("\t", 2)
            # Blank or short lines cannot fill all three columns; skip them
            # rather than aborting the whole run with an IndexError.
            if len(tmp_data) < 3:
                continue
            if max_len and len(tmp_data[0]) > max_len:
                continue
            rows.append(tmp_data)
    # Bound parameters instead of '%s' interpolation: an entry containing an
    # apostrophe would otherwise terminate the SQL string literal early and
    # make the statement fail (or change its meaning).
    cursor.executemany("insert into tmp_allinone_dict values(?,?,?)", rows)
    dict_db.commit()

#输出rime文件
def rime_yaml_output(filename_1,filename_2):

Expand Down Expand Up @@ -308,6 +321,11 @@ def conver_file(tablename,filename_1,filename_2):
fr2.close()
table_polyphonic_dict_new = sql.read_sql("SELECT * FROM tmp_polyphonic GROUP BY dict,dict_code" , dict_db)
table_polyphonic_dict_new.to_csv(filename_1+".dict.yaml",header=0,sep='\t',index=False,float_format='%.0f')
#合并输出
if rime_dict_allinone_set:
#插入词库到tmp_allinone_dict表
if yaml_dict_name.find("polyphonic_dict") == -1 and yaml_dict_name.find("english_dict") == -1 and yaml_dict_name.find("lettered_word_dict") == -1:
insert_dict_to_allinone(filename_1)
if order_set:
order(filename_1)
rime_yaml_output(filename_1,filename_2)
Expand Down Expand Up @@ -341,6 +359,7 @@ def conver_file(tablename,filename_1,filename_2):
no_finals_fix_set = os.getenv('NO_FINALS_FIX',default = 'True') == 'True'
lettered_word_non_delimiter_set = os.getenv('LETTERED_WORD_NON_DELIMITER',default = 'False') == 'True'
imewlconverter_set = os.getenv('IMEWLCONVERTER',default = 'True') == 'True'
rime_dict_allinone_set = os.getenv('RIME_DICT_ALLINONE',default = 'False') == 'True'

#创建表格
dict_db = sqlite3.connect('../dict.db')
Expand Down Expand Up @@ -369,6 +388,7 @@ def conver_file(tablename,filename_1,filename_2):
cursor.execute("CREATE TABLE IF NOT EXISTS tmp_wiki_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" )
cursor.execute("CREATE TABLE IF NOT EXISTS tmp_lettered_word_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" )
cursor.execute("CREATE TABLE IF NOT EXISTS tmp_all_dict (dict TEXT)" )
cursor.execute("CREATE TABLE IF NOT EXISTS tmp_allinone_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" )

#基础词库
if basic_dict_set:
Expand Down Expand Up @@ -452,6 +472,10 @@ def conver_file(tablename,filename_1,filename_2):
if not re.match(r"^[\t]",line_a):
file_out.write(line_a+' 1\n')
file_out.close()
#合并输出
if rime_dict_allinone_set:
#插入词库到tmp_allinone_dict表
insert_dict_to_allinone(filename_1)
rime_yaml_output(filename_1,filename_2)

#英语词库
Expand Down Expand Up @@ -670,10 +694,23 @@ def conver_file(tablename,filename_1,filename_2):
else:
file_lettered_word_out.write(line_a+' 1\n')
file_lettered_word_out.close()
#合并输出
if rime_dict_allinone_set:
#插入词库到tmp_allinone_dict表
insert_dict_to_allinone(filename_1)
if order_set:
order(filename_1)
rime_yaml_output(filename_1,filename_2)

#合并输出allinone文件
if rime_dict_allinone_set:
table_allinone_dict = sql.read_sql("SELECT * FROM tmp_allinone_dict GROUP BY length(dict),dict,dict_code" , dict_db)
filename_1 = prefix_dict_name+'total_dict'
filename_2 = '全部词汇'
yaml_dict_name = filename_1+".dict.yaml"
table_allinone_dict.to_csv(yaml_dict_name,header=0,sep='\t',index=False,float_format='%.0f')
rime_yaml_output(filename_1,filename_2)

#删除表格
cursor.execute("DROP TABLE IF EXISTS tmp_1" )
cursor.execute("DROP TABLE IF EXISTS tmp_2" )
Expand All @@ -687,6 +724,7 @@ def conver_file(tablename,filename_1,filename_2):
cursor.execute("DROP TABLE IF EXISTS tmp_wiki_dict" )
cursor.execute("DROP TABLE IF EXISTS tmp_lettered_word_dict" )
cursor.execute("DROP TABLE IF EXISTS tmp_all_dict" )
cursor.execute("DROP TABLE IF EXISTS tmp_allinone_dict" )
dict_db.commit()
cursor.execute("VACUUM" )
cursor.close()
Expand Down
1 change: 0 additions & 1 deletion sogou_pop_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
#
#sogou输入法网络流行新词
#https://pinyin.sogou.com/dict/detail/index/4
# 部署位置:
# ~/.config/ibus/rime (Linux ibus)
# ~/.config/fcitx/rime (Linux fcitx)
Expand Down

0 comments on commit 8bf848d

Please sign in to comment.