Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

添加phrase dict的支持 #62

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.DS_Store

# Created by https://www.gitignore.io/api/rust,code,intellij+all
# Edit at https://www.gitignore.io/?templates=rust,code,intellij+all
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "pinyin-data"]
path = pinyin-data
url = https://github.com/mozillazg/pinyin-data.git
[submodule "phrase-pinyin-data"]
path = phrase-pinyin-data
url = https://github.com/mozillazg/phrase-pinyin-data
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ include = [
]
edition = "2018"

[dependencies]
lazy_static = "1.4.0"
seq-macro = "0.3.2"

[workspace]
members = ["coverage-check"]

Expand Down
93 changes: 93 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ const RAW_DATA: &str = include_str!(concat!(
"/pinyin-data/pinyin.txt"
));

/// Raw text of `phrase-pinyin-data/pinyin.txt`, embedded at compile time.
///
/// `build_phrase_data` parses it line by line as `phrase: pinyin`, where
/// everything after a `#` is treated as a comment.
const RAW_PHRASE_DATA: &str = include_str!(concat!(
env!("CARGO_MANIFEST_DIR"),
"/phrase-pinyin-data/pinyin.txt"
));

#[cfg(any(
feature = "plain",
feature = "with_tone_num",
Expand Down Expand Up @@ -81,6 +86,7 @@ const TONE_NUMS: &[char] = &['0', '1', '2', '3', '4'];

// A string label paired with a function that rewrites a pinyin string
// (see the entries of `STYLES`).
type Style = (&'static str, fn(&str) -> Cow<'_, str>);
// NOTE(review): presumably (code point, readings of that character) -- confirm
// against `build_data`, which is not fully visible here.
type InputData = Vec<(u32, Vec<&'static str>)>;
// Maps a phrase to its pinyin strings (space-separated syllables), one entry
// per data line that mentions the phrase.
type PhraseInputData = HashMap<&'static str, Vec<&'static str>>;
// Maps a pinyin syllable to the numeric index that is written into the
// generated tables.
type PinyinDataIndex = HashMap<&'static str, usize>;
// NOTE(review): presumably keyed by code point -- confirm against
// `generate_heteronym_table`.
type HeteronymDataIndex = HashMap<u32, usize>;

Expand All @@ -89,6 +95,10 @@ fn main() -> io::Result<()> {
let pinyin_index = generate_pinyin_data(&data)?;
let heteronym_index = generate_heteronym_table(&data, &pinyin_index)?;
generate_char_table(&data, &pinyin_index, &heteronym_index)?;

let phrase_data = build_phrase_data();
generate_phrase_table(&phrase_data, &pinyin_index)?;

// 输出这行以保证改动项目的其他文件不会触发编译脚本重新执行
println!("cargo:rerun-if-changed=build.rs");
Ok(())
Expand Down Expand Up @@ -144,6 +154,47 @@ fn build_data() -> InputData {
input_data
}

/// Parses the embedded phrase data into a map from phrase to its pinyin strings.
///
/// Every meaningful line of `RAW_PHRASE_DATA` has the form
/// `phrase: syllable syllable ...`. Anything after a `#` is a comment, and
/// lines that are empty once the comment is removed are skipped. A phrase that
/// appears on several lines collects one pinyin string per line, in file order.
///
/// # Panics
///
/// Panics (and thereby fails the build) when a data line has no `:` separator
/// or when a syllable contains a character that is not in `LETTER_TABLE`.
/// Line numbers in the messages are 1-based.
fn build_phrase_data() -> PhraseInputData {
    let mut input_data: HashMap<&str, Vec<&str>> = HashMap::new();

    for (index, raw_line) in RAW_PHRASE_DATA.lines().enumerate() {
        // Report 1-based line numbers so they match what an editor shows.
        let line_no = index + 1;

        // Drop the trailing comment, then the surrounding whitespace.
        let line = match raw_line.find('#') {
            Some(hash_pos) => &raw_line[..hash_pos],
            None => raw_line,
        }
        .trim();
        if line.is_empty() {
            continue;
        }

        // Split the line at the first colon. Malformed data can reach the
        // `None` arm, so this is a regular panic rather than `unreachable!`.
        let colon_pos = match line.find(':') {
            Some(pos) => pos,
            None => panic!("no colon found at line {}: {}", line_no, line),
        };
        let phrase = line[..colon_pos].trim();
        let pinyin = line[colon_pos + 1..].trim();

        // Make sure every character of the pinyin is one we know how to handle.
        for syllable in pinyin.split(' ') {
            for ch in syllable.chars() {
                assert!(
                    LETTER_TABLE.contains(&ch),
                    "unknown character {:?} at line {}: {}",
                    ch,
                    line_no,
                    line,
                );
            }
        }

        input_data.entry(phrase).or_default().push(pinyin);
    }

    input_data
}

const STYLES: &[Style] = &[
#[cfg(feature = "plain")]
("plain", |input| {
Expand Down Expand Up @@ -327,6 +378,48 @@ fn generate_char_table(
Ok(())
}

// Longest phrase, counted in characters, for which a phrase table is generated.
// Important: always keep this in sync with the upper bound of the `seq!` ranges
// (`2..=9`) in src/data.rs and src/pinyin.rs.
const MAX_PHRASE_LENGTH: usize = 9;

/// Writes one `phrase_table_<len>.rs` file per phrase length in
/// `2..=MAX_PHRASE_LENGTH` into the build output directory.
///
/// Each file is a block expression that builds a `HashMap` from a phrase to
/// the pinyin indices of its first listed reading, one index per character.
///
/// * `data` - phrase to pinyin strings, as produced by `build_phrase_data`.
/// * `pinyin_index` - syllable to numeric index lookup.
///
/// Phrases shorter than 2 or longer than `MAX_PHRASE_LENGTH` characters are
/// skipped. Entries are written in sorted phrase order so that the generated
/// files are identical from build to build.
///
/// # Errors
///
/// Returns any I/O error raised while creating, writing or flushing the files.
///
/// # Panics
///
/// Panics when a syllable of any reading is missing from `pinyin_index`, or
/// when the emitted reading does not have exactly one syllable per character
/// of the phrase (the generated tables use fixed-size arrays).
fn generate_phrase_table(data: &PhraseInputData, pinyin_index: &PinyinDataIndex) -> io::Result<()> {
    // Single characters are served by the character table, so phrase tables
    // start at two characters.
    const MIN_PHRASE_LENGTH: usize = 2;

    let mut phrase_tables = (MIN_PHRASE_LENGTH..=MAX_PHRASE_LENGTH)
        .map(|phrase_len| create_out_file(&format!("phrase_table_{}.rs", phrase_len)))
        .collect::<Result<Vec<_>, _>>()?;
    for table in &mut phrase_tables {
        writeln!(table, "{{")?;
        writeln!(table, "let mut m = HashMap::new();")?;
    }

    // `HashMap` iteration order is unspecified; sort for reproducible output.
    let mut entries: Vec<(&str, &[&str])> = data
        .iter()
        .map(|(phrase, pinyins)| (*phrase, pinyins.as_slice()))
        .collect();
    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

    for (phrase, pinyins) in entries {
        let phrase_len = phrase.chars().count();
        // Skip phrases that have no table; this also keeps the index below
        // from underflowing for phrases shorter than two characters.
        if !(MIN_PHRASE_LENGTH..=MAX_PHRASE_LENGTH).contains(&phrase_len) {
            continue;
        }

        // Resolve every reading so that unknown syllables are reported even
        // when they are not part of the reading that gets emitted.
        let readings: Vec<Vec<String>> = pinyins
            .iter()
            .map(|pinyin| {
                pinyin
                    .split(' ')
                    .map(|syllable| match pinyin_index.get(syllable) {
                        Some(index) => index.to_string(),
                        None => panic!(
                            "unknown syllable {:?} in pinyin {:?} of phrase {:?}",
                            syllable, pinyin, phrase
                        ),
                    })
                    .collect()
            })
            .collect();

        // Only the first listed reading is emitted.
        let first = match readings.first() {
            Some(indices) => indices,
            None => continue,
        };
        assert_eq!(
            first.len(),
            phrase_len,
            "phrase {:?} has {} characters but its first pinyin has {} syllables",
            phrase,
            phrase_len,
            first.len(),
        );

        // `{:?}` emits a properly escaped Rust string literal for the phrase.
        writeln!(
            phrase_tables[phrase_len - MIN_PHRASE_LENGTH],
            "m.insert({:?}, &[{}]);",
            phrase,
            first.join(",")
        )?;
    }

    for table in &mut phrase_tables {
        writeln!(table, "m")?;
        writeln!(table, "}}")?;
        // Flush explicitly: errors during the implicit flush on drop are lost.
        table.flush()?;
    }
    Ok(())
}

fn create_out_file(name: &str) -> io::Result<impl Write> {
let path = Path::new(&env::var("OUT_DIR").unwrap()).join(name);
Ok(BufWriter::new(File::create(&path)?))
Expand Down
1 change: 1 addition & 0 deletions phrase-pinyin-data
Submodule phrase-pinyin-data added at 129b36
10 changes: 10 additions & 0 deletions src/data.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#![allow(clippy::unreadable_literal)]

use crate::{CharBlock, PinyinData};
use lazy_static::lazy_static;
use seq_macro::seq;
use std::collections::HashMap;

pub(crate) static PINYIN_DATA: &[PinyinData] =
include!(concat!(env!("OUT_DIR"), "/pinyin_data.rs"));
Expand All @@ -10,3 +13,10 @@ pub(crate) static HETERONYM_TABLE: &[&[u16]] =
include!(concat!(env!("OUT_DIR"), "/heteronym_table.rs"));

pub(crate) static CHAR_BLOCKS: &[CharBlock] = include!(concat!(env!("OUT_DIR"), "/char_blocks.rs"));

// One lazily built table per phrase length N in 2..=9: `PHRASE_TABLE_N` maps a
// phrase of N characters to N indices into `PINYIN_DATA`. The contents come
// from the build-script output `phrase_table_N.rs`.
// Important: keep the `2..=9` range in sync with MAX_PHRASE_LENGTH in build.rs.
seq!(N in 2..=9 {
lazy_static! {
#(pub(crate) static ref PHRASE_TABLE_~N: HashMap<&'static str, &'static [u16; N]> =
HashMap::from(include!(concat!(env!("OUT_DIR"), "/phrase_table_", stringify!(N), ".rs")));)*
}
});
128 changes: 126 additions & 2 deletions src/pinyin.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use crate::data::PINYIN_DATA;
use crate::data::*;
use crate::{get_block_and_index, PinyinData};
use seq_macro::seq;
use std::convert::TryFrom;
use std::slice::Iter;
use std::str::Chars;

/// 单个字符的拼音信息
Expand Down Expand Up @@ -148,12 +150,134 @@ impl<'a> Iterator for PinyinStrIter<'a> {
}
}

/// *Helper iterator* that yields the pinyin information of a sequence of phrases.
///
/// Wraps a slice iterator over the phrases (`&str`).
pub struct PinyinPhraseIter<'a>(Iter<'a, &'a str>);

impl<'a> Iterator for PinyinPhraseIter<'a> {
    type Item = Option<Vec<Pinyin>>;

    /// Returns the pinyin of the next phrase, or `None` once all phrases are consumed.
    ///
    /// A single-character phrase is looked up as a character. A phrase of 2 to 9
    /// characters is looked up in the phrase table for its length and falls back
    /// to character-by-character conversion when it is not listed. Any other
    /// phrase is converted character by character.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let phrase = self.0.next()?;
        // Important: Always stay in sync with MAX_PHRASE_LENGTH in build.rs
        let pinyins: Option<Vec<Pinyin>> = seq!(N in 2..=9 {
            match phrase.chars().count() {
                1 => phrase
                    .chars()
                    .next()
                    .and_then(|ch| ch.to_pinyin())
                    .map(|pinyin| vec![pinyin]),
                #(N => PHRASE_TABLE_~N
                    .get(phrase)
                    .map(|indices| {
                        indices
                            .iter()
                            .map(|&idx| Pinyin(&PINYIN_DATA[idx as usize]))
                            .collect::<Vec<_>>()
                    })
                    .or_else(|| phrase.to_pinyin().collect()),)*
                _ => phrase.to_pinyin().collect(),
            }
        });
        Some(pinyins)
    }
}

/// Annotates every word of an already segmented text with pinyin. If a word
/// contains an unsupported character, the result for that word is `None`;
/// otherwise it is `Some(vec![..])` holding the pinyin of each character of
/// the word.
/// ```
/// # #[cfg(feature = "plain")] {
/// use pinyin::{ToPinyin, Pinyin};
/// let mut iter = ["薄荷", "是", "便宜货"].iter().to_pinyin();
/// let mut next_plain = || iter.next().map(|ps|
/// ps.map(|ps| ps.iter().map(|p|
/// Pinyin::plain(*p)).collect::<Vec<_>>()));
/// assert_eq!(next_plain(), Some(Some(vec!["bo", "he"])));
/// assert_eq!(next_plain(), Some(Some(vec!["shi"])));
/// assert_eq!(next_plain(), Some(Some(vec!["pian", "yi", "huo"])));
/// assert_eq!(next_plain(), None);
/// # }
/// ```
impl<'a> ToPinyin for Iter<'a, &'a str> {
    type Output = PinyinPhraseIter<'a>;

    /// Wraps a copy of this slice iterator; the receiver itself is not advanced.
    fn to_pinyin(&self) -> Self::Output {
        let phrases = self.clone();
        PinyinPhraseIter(phrases)
    }
}

#[cfg(test)]
mod tests {
    // `ToPinyin` must be imported only once; a second `use crate::ToPinyin;`
    // alongside this line is a duplicate-definition error (E0252).
    use crate::{Pinyin, ToPinyin};

    /// The highest Unicode scalar value has no pinyin.
    #[test]
    fn special_code_point() {
        assert!('\u{10FFFF}'.to_pinyin().is_none());
    }

    /// Phrase-aware conversion picks the reading that fits the phrase and
    /// yields `None` for phrases containing unsupported characters.
    #[test]
    fn phrase_pinyin() {
        [
            (vec!["重新"], vec![Some(vec!["chóng", "xīn"])]),
            (vec!["同行"], vec![Some(vec!["tóng", "háng"])]),
            // tone may change, wait for issue: https://github.com/mozillazg/phrase-pinyin-data/issues/43
            (
                vec!["便宜", "便宜货"],
                vec![Some(vec!["pián", "yí"]), Some(vec!["pián", "yí", "huò"])],
            ),
            (
                vec!["贪便宜", "便宜从事"],
                vec![
                    Some(vec!["tān", "pián", "yí"]),
                    Some(vec!["biàn", "yí", "cóng", "shì"]),
                ],
            ),
            (vec!["打量"], vec![Some(vec!["dǎ", "liàng"])]),
            (
                vec!["薄荷", "薄弱", "衣服", "薄"],
                vec![
                    Some(vec!["bò", "hé"]),
                    Some(vec!["bó", "ruò"]),
                    Some(vec!["yī", "fú"]),
                    Some(vec!["báo"]),
                ],
            ),
            (
                vec!["高血压", "流血"],
                vec![Some(vec!["gāo", "xuè", "yā"]), Some(vec!["liú", "xiě"])],
            ),
            // "大喝一声" is out-of-vocabulary right now,
            // so we comment that out for now
            (
                vec![
                    // "大喝一声",
                    "喝水", "喝彩",
                ],
                vec![
                    // Some(vec!["dà", "hè", "yī", "shēng"]),
                    Some(vec!["hē", "shuǐ"]),
                    Some(vec!["hè", "cǎi"]),
                ],
            ),
            (vec!["\u{10FFFF}"], vec![None]),
            (vec!["\u{10FFFF}你好"], vec![None]),
        ]
        .iter()
        .for_each(|(phrase, pinyin)| {
            assert_eq!(
                &phrase
                    .iter()
                    .to_pinyin()
                    .map(|pinyins| pinyins.map(|pinyins| pinyins
                        .iter()
                        .map(|p| Pinyin::with_tone(*p))
                        .collect::<Vec<_>>()))
                    .collect::<Vec<_>>(),
                pinyin
            )
        })
    }
}