diff --git a/.github/workflows/rime_dict.yml b/.github/workflows/rime_dict.yml index 231c0bd..5d9ca49 100644 --- a/.github/workflows/rime_dict.yml +++ b/.github/workflows/rime_dict.yml @@ -7,11 +7,12 @@ on: workflow_dispatch: env: - IMEWLCONVERTER_VER: 3.0.0 + IMEWLCONVERTER_VER: 3.1.0 + DOTNET_VER: '8.0' jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v3 @@ -19,7 +20,7 @@ jobs: - name: Setup .NET Core uses: actions/setup-dotnet@v3 with: - dotnet-version: 6.0.x + dotnet-version: 8.0.x - name: set env run: | @@ -37,14 +38,13 @@ jobs: #多音字 sed -i "1i\'cheng'du 成都" ./src/ImeWlConverterCore/Resources/WordPinyin.txt dotnet build ./src/ImeWlConverterCmd - cd /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net6.0 + cd /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }} tar -zcf v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz * - cp -rf /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net6.0 /home/runner/work/rime-dict/rime-dict/imewlconverter + cp -rf /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }} /home/runner/work/rime-dict/rime-dict/imewlconverter wget -P /home/runner/work/rime-dict/rime-dict https://github.com/gshang2017/rime-dict/releases/download/${{ env.TAG_NAME }}/dict.tar.gz cd /home/runner/work/rime-dict/rime-dict/ tar -zxf dict.tar.gz rm -rf dict.tar.gz - sudo pip install --upgrade setuptools==57.5.0 sudo pip install -r requirements.txt tar -zcf dict.tar.gz dict.db export SOGOU_OFFICIAL=True @@ -59,6 +59,7 @@ jobs: export WIKI_DICT=True export CHAIZI_DICT=True export IMEWLCONVERTER=False + export RIME_DICT_ALLINONE=True python3 rime_dict.py python3 sogou_pop_dict.py cd 
/home/runner/work/rime-dict/rime-dict/output @@ -89,4 +90,4 @@ jobs: /home/runner/work/rime-dict/rime-dict/output/rime-dict.txt.tar.gz /home/runner/work/rime-dict/rime-dict/output/rime-dict-non-tengxun-del.yaml.tar.gz /home/runner/work/rime-dict/rime-dict/output/rime-dict-non-tengxun-del.txt.tar.gz - /home/runner/work/rime-dict/rime-dict/imewlconverter-3.0.0/src/ImeWlConverterCmd/bin/Debug/net6.0/v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz + /home/runner/work/rime-dict/rime-dict/imewlconverter-${{ env.IMEWLCONVERTER_VER }}/src/ImeWlConverterCmd/bin/Debug/net${{ env.DOTNET_VER }}/v${{ env.IMEWLCONVERTER_VER }}_imewlconverter_Linux_Mac.tar.gz diff --git a/README.md b/README.md index f2c510b..3b8bdd8 100644 --- a/README.md +++ b/README.md @@ -41,12 +41,13 @@ | `chaizi_dict` |拆字词库| | `lettered_word_dict` |字母词词库| | `sogou_pop.dict` |搜狗流行词词库| +| `total_dict.dict` |除流行词词库外全部词库合并版| ### 用docker生成 * 详见:[https://github.com/gshang2017/rime-dict/tree/main/docker](https://github.com/gshang2017/rime-dict/tree/main/docker "https://github.com/gshang2017/rime-dict/tree/main/docker") -### 用命令行生成(ubuntu-20.04) +### 用命令行生成(ubuntu-22.04) 1.安装下载依赖 @@ -63,26 +64,21 @@ 4.下载修改版imewlconverter - wget https://github.com/gshang2017/rime-dict/releases/download/2023.03.20/v3.0.0_imewlconverter_Linux_Mac.tar.gz + wget https://github.com/gshang2017/rime-dict/releases/download/2024.06.14/v3.1.0_imewlconverter_Linux_Mac.tar.gz mkdir -p imewlconverter - tar -zxf v3.0.0_imewlconverter_Linux_Mac.tar.gz -C imewlconverter + tar -zxf v3.1.0_imewlconverter_Linux_Mac.tar.gz -C imewlconverter 5.下载rime-dict数据库 - wget https://github.com/gshang2017/rime-dict/releases/download/2023.03.20/dict.tar.gz + wget https://github.com/gshang2017/rime-dict/releases/download/2024.06.14/dict.tar.gz tar -zxf dict.tar.gz 6.安装rime-dict依赖 sudo apt install python3-pip - sudo pip install --upgrade setuptools==57.5.0 sudo pip install -r requirements.txt sudo apt install -y opencc - wget 
# Insert one generated dictionary file into the tmp_allinone_dict table.
def insert_dict_to_allinone(filename_1):
    """Load ``<filename_1>.dict.yaml`` into the ``tmp_allinone_dict`` table.

    Each line of the file is split on tabs into at most three fields, which
    are inserted as one row, in file order.  When ``int(len_num_set)`` is
    non-zero, only rows whose first field is at most that many characters
    long are inserted; when it is zero, every row is inserted.

    Relies on the module-level ``cursor``, ``dict_db`` and ``len_num_set``.

    Args:
        filename_1: Path of the dictionary file without the ``.dict.yaml``
            suffix.
    """
    # The limit does not change while reading, so convert it once.
    max_len = int(len_num_set)
    rows = []
    # ``with`` closes the file even if reading fails part-way through.
    with open(filename_1 + ".dict.yaml", 'r', encoding='UTF-8') as fr_allinone:
        for line in fr_allinone:
            fields = line.rstrip("\n").split("\t", 2)
            # Blank or truncated lines cannot form a full row; skip them
            # instead of failing with IndexError.
            if len(fields) < 3:
                continue
            if max_len and len(fields[0]) > max_len:
                continue
            rows.append(fields)
    # Bound parameters keep quotes inside an entry (e.g. "o'clock") from
    # breaking, or injecting into, the SQL statement.
    cursor.executemany("insert into tmp_allinone_dict values(?,?,?)", rows)
    dict_db.commit()
sql.read_sql("SELECT * FROM tmp_polyphonic GROUP BY dict,dict_code" , dict_db) table_polyphonic_dict_new.to_csv(filename_1+".dict.yaml",header=0,sep='\t',index=False,float_format='%.0f') + #合并输出 + if rime_dict_allinone_set: + #插入词库到tmp_allinone_dict表 + if yaml_dict_name.find("polyphonic_dict") == -1 and yaml_dict_name.find("english_dict") == -1 and yaml_dict_name.find("lettered_word_dict") == -1: + insert_dict_to_allinone(filename_1) if order_set: order(filename_1) rime_yaml_output(filename_1,filename_2) @@ -341,6 +359,7 @@ def conver_file(tablename,filename_1,filename_2): no_finals_fix_set = os.getenv('NO_FINALS_FIX',default = 'True') == 'True' lettered_word_non_delimiter_set = os.getenv('LETTERED_WORD_NON_DELIMITER',default = 'False') == 'True' imewlconverter_set = os.getenv('IMEWLCONVERTER',default = 'True') == 'True' +rime_dict_allinone_set = os.getenv('RIME_DICT_ALLINONE',default = 'False') == 'True' #创建表格 dict_db = sqlite3.connect('../dict.db') @@ -369,6 +388,7 @@ def conver_file(tablename,filename_1,filename_2): cursor.execute("CREATE TABLE IF NOT EXISTS tmp_wiki_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" ) cursor.execute("CREATE TABLE IF NOT EXISTS tmp_lettered_word_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" ) cursor.execute("CREATE TABLE IF NOT EXISTS tmp_all_dict (dict TEXT)" ) +cursor.execute("CREATE TABLE IF NOT EXISTS tmp_allinone_dict (dict TEXT,dict_code TEXT,dict_frequency INTEGER)" ) #基础词库 if basic_dict_set: @@ -452,6 +472,10 @@ def conver_file(tablename,filename_1,filename_2): if not re.match(r"^[\t]",line_a): file_out.write(line_a+' 1\n') file_out.close() + #合并输出 + if rime_dict_allinone_set: + #插入词库到tmp_allinone_dict表 + insert_dict_to_allinone(filename_1) rime_yaml_output(filename_1,filename_2) #英语词库 @@ -670,10 +694,23 @@ def conver_file(tablename,filename_1,filename_2): else: file_lettered_word_out.write(line_a+' 1\n') file_lettered_word_out.close() + #合并输出 + if rime_dict_allinone_set: + #插入词库到tmp_allinone_dict表 + 
# Write the merged all-in-one dictionary file.
# Runs only when the RIME_DICT_ALLINONE switch (rime_dict_allinone_set) is on.
if rime_dict_allinone_set:
    # Module-level names are kept as-is: other functions in this script read
    # globals such as yaml_dict_name.
    filename_1 = prefix_dict_name+'total_dict'
    filename_2 = '全部词汇'  # title passed to rime_yaml_output ("all vocabulary")
    yaml_dict_name = filename_1+".dict.yaml"
    # One row per (length, word, code) group collected in tmp_allinone_dict.
    table_allinone_dict = sql.read_sql(
        sql="SELECT * FROM tmp_allinone_dict GROUP BY length(dict),dict,dict_code",
        con=dict_db,
    )
    # Tab-separated, no header row, no index column, frequencies as integers.
    table_allinone_dict.to_csv(
        path_or_buf=yaml_dict_name,
        header=0,
        sep='\t',
        index=False,
        float_format='%.0f',
    )
    rime_yaml_output(filename_1, filename_2)