Skip to content

Commit

Permalink
feat:CJK 修改为中文
Browse files Browse the repository at this point in the history
  • Loading branch information
ricoa committed Apr 24, 2017
1 parent c0930db commit bbe2748
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 43 deletions.
2 changes: 1 addition & 1 deletion src/Correctors/CharacterCorrector.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public function handle($text)
$text= str_replace(getDBC(),getSBC(),$text);

//中文后使用全角中文标点
$text = preg_replace_callback("/([".getCJK().'])([!?\.,\(\):;\'\"])/iu', function($m){
$text = preg_replace_callback("/([".getCN().'])([!?\.,\(\):;\'\"])/iu', function($m){
$replace=[
'!'=>'',
'?'=>'',
Expand Down
54 changes: 27 additions & 27 deletions src/Correctors/SpaceCorrector.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

/**
*
* 在 CJK 字符与英文字符间添加空格
* 在中文字符与英文字符间添加空格
*
*Updated, based on https://github.com/Rakume/pangu.php/blob/fc7d1c54ada1c85bb0e2725d41ce41b449eb3737/pangu.php
*
Expand All @@ -28,15 +28,15 @@ class SpaceCorrector extends Corrector{
*/
protected function __construct()
{
$cjk = getCJK();
$cn = getCN();
$this->patterns = array(
'cjk_quote' => array(
'([' . $cjk . '])(["\'])',
'cn_quote' => array(
'([' . $cn . '])(["\'])',
'$1 $2'
),

'quote_cjk' => array(
'(["\'])([' . $cjk . '])',
'quote_cn' => array(
'(["\'])([' . $cn . '])',
'$1 $2'
),

Expand All @@ -45,40 +45,40 @@ protected function __construct()
'$1$3$5'
),

'cjk_hash' => array(
'([' . $cjk . '])(#(\S+))',
'cn_hash' => array(
'([' . $cn . '])(#(\S+))',
'$1 $2'
),

'hash_cjk' => array(
'((\S+)#)([' . $cjk . '])',
'hash_cn' => array(
'((\S+)#)([' . $cn . '])',
'$1 $3'
),

'cjk_operator_ans' => array(
'([' . $cjk . '])([A-Za-z0-9])([\+\-\*\/=&\\|<>])',
'cn_operator_ans' => array(
'([' . $cn . '])([A-Za-z0-9])([\+\-\*\/=&\\|<>])',
'$1 $2 $3'
),

'ans_operator_cjk' => array(
'([\+\-\*\/=&\\|<>])([A-Za-z0-9])([' . $cjk . '])',
'ans_operator_cn' => array(
'([\+\-\*\/=&\\|<>])([A-Za-z0-9])([' . $cn . '])',
'$1 $2 $3'
),

'bracket' => array(
array(
'([' . $cjk . '])([<\[\{\(]+(.*?)[>\]\}\)]+)([' . $cjk . '])',
'([' . $cn . '])([<\[\{\(]+(.*?)[>\]\}\)]+)([' . $cn . '])',
'$1 $2 $4'
),

array(
'cjk_bracket' => array(
'([' . $cjk . '])([<>\[\]\{\}\(\)])',
'cn_bracket' => array(
'([' . $cn . '])([<>\[\]\{\}\(\)])',
'$1 $2'
),

'bracket_cjk' => array(
'([<>\[\]\{\}\(\)])([' . $cjk . '])',
'bracket_cn' => array(
'([<>\[\]\{\}\(\)])([' . $cn . '])',
'$1 $2'
)
)
Expand All @@ -89,13 +89,13 @@ protected function __construct()
'$1$3$5'
),

'cjk_ans' => array(
'([' . $cjk . '])([A-Za-z0-9`@&%\=\$\^\*\-\+\\/|\\\])',
'cn_ans' => array(
'([' . $cn . '])([A-Za-z0-9`@&%\=\$\^\*\-\+\\/|\\\])',
'$1 $2'
),

'ans_cjk' => array(
'([A-Za-z0-9`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])([' . $cjk . '])',
'ans_cn' => array(
'([A-Za-z0-9`~!%&=;\|\,\.\:\?\$\^\*\-\+\/\\\])([' . $cn . '])',
'$1 $2'
),
'number_letters'=>array(
Expand All @@ -106,12 +106,12 @@ protected function __construct()
"([ ]*)([\u{FF00}-\u{FFFF}。])([ ]*)",
'$2'
),
'cjk_greek'=>array(
'([' . $cjk . '0-9A-Za-z])([\p{Greek}])',
'cn_greek'=>array(
'([' . $cn . '0-9A-Za-z])([\p{Greek}])',
'$1 $2'
),
'greek_cjk'=>array(
'([\p{Greek}])([' . $cjk . '0-9A-Za-z])',
'greek_cn'=>array(
'([\p{Greek}])([' . $cn . '0-9A-Za-z])',
'$1 $2'
),
);
Expand Down
13 changes: 2 additions & 11 deletions src/helpers.php
Original file line number Diff line number Diff line change
@@ -1,16 +1,7 @@
<?php

function getCJK(){
return '' .
'\x{2e80}-\x{2eff}' .
'\x{2f00}-\x{2fdf}' .
'\x{3040}-\x{309f}' .
'\x{30a0}-\x{30ff}' .
'\x{3100}-\x{312f}' .
'\x{3200}-\x{32ff}' .
'\x{3400}-\x{4dbf}' .
'\x{4e00}-\x{9fff}' .
'\x{f900}-\x{faff}';
/**
 * Returns the regex fragment that stands for Chinese (Han) characters.
 *
 * The value is the Unicode script property escape for Han. It is meant to be
 * spliced into a bracketed character class by the caller, for example
 * '([' . getCN() . '])'.
 *
 * @return string Character-class fragment, without surrounding brackets or delimiters.
 */
function getCN()
{
    $hanScript = '\p{Han}';

    return $hanScript;
}

//获得全角字符
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/SpaceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use Ricoa\CopyWritingCorrect\CopyWritingCorrectService;

/**
* 检测 CJK和半角字符之间增加空格
* 检测 中文和半角字符之间增加空格
*
* Class SpaceBetweenCjkAndEngTest
*/
Expand All @@ -22,13 +22,13 @@ public function testMe()
{
$service=new CopyWritingCorrectService();

//CJK 与英文
//中文与英文
$this->assertEquals(
'在 LeanCloud 上,数据存储是围绕 AVObject 进行的。每个 AVObject 都包含了与 JSON 兼容的 key-value 对应的数据。数据是 schema-free 的,你不需要在每个 AVObject 上提前指定存在哪些键,只要直接设定对应的 key-value 即可。',
$service->correct('在LeanCloud上,数据存储是围绕AVObject进行的。每个AVObject都包含了与JSON兼容的key-value对应的数据。数据是schema-free的,你不需要在每个AVObject上提前指定存在哪些键,只要直接设定对应的key-value即可。')
);

//CJK 与数字
//中文与数字
$this->assertEquals(
'今天出去买菜花了 5000 元。',
$service->correct('今天出去买菜花了5000元。')
Expand All @@ -46,7 +46,7 @@ public function testMe()
$service->correct('我家的光纤入户宽带有10Gbps ,SSD一共有20TB。 ')
);

//希腊字母与 CJK 字符以及数字英文字符之间添加空格
//希腊字母与中文字符以及数字英文字符之间添加空格
$this->assertEquals(
'电阻为 1 Ω',
$service->correct('电阻为1Ω')
Expand Down

0 comments on commit bbe2748

Please sign in to comment.