Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add specific rules for BGN/PCGN romanization of Russian #10

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Transliterator/DataLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public function __construct()
public function getTransliterationMap($path, $alphabet)
{
// Validate
if (!in_array($alphabet, array(Settings::ALPHABET_CYR, Settings::ALPHABET_LAT))) {
if (!in_array($alphabet, array(Settings::ALPHABET_CYR, Settings::ALPHABET_LAT, Settings::ALPHABET_CYR_REGEXP, Settings::ALPHABET_LAT_REGEXP))) {
throw new \InvalidArgumentException(sprintf('Alphabet "%s" is not recognized.', $alphabet));
}

Expand Down
10 changes: 10 additions & 0 deletions src/Transliterator/Settings.php
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,16 @@ class Settings
*/
const ALPHABET_LAT = 'lat';

/**
* Cyrillic additional replacement rules.
*
* Identifier passed to the transliteration map lookup to fetch the
* regular-expression rules that are run on Cyrillic input before the
* plain character map is applied.
*/
const ALPHABET_CYR_REGEXP = 'cyr_regexp';

/**
* Latin additional replacement rules.
*
* Identifier passed to the transliteration map lookup to fetch the
* regular-expression rules that are run on Latin input before the
* plain character map is applied.
*/
const ALPHABET_LAT_REGEXP = 'lat_regexp';

/**
* ISO 639-1 language code.
*
Expand Down
67 changes: 67 additions & 0 deletions src/Transliterator/Transliterator.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,20 @@ class Transliterator
*/
protected $latMap;

/**
* Cyrillic replacement rules.
*
* @var array
*/
protected $cyrReplacement;

/**
* Latin replacement rules.
*
* @var array
*/
protected $latReplacement;

/**
* Transliterator constructor.
*
Expand Down Expand Up @@ -125,6 +139,27 @@ public function lat2Cyr($text)
return $this->transliterate($text, false);
}

/**
 * Applies regular-expression replacement rules to the text.
 *
 * Called before the plain character map so that context-dependent rules
 * can be resolved first.
 *
 * @param array|null $rules Rule set holding parallel 'pattern' and 'replacement'
 *                          lists; an empty or incomplete set disables the step
 * @param string     $text  Text to process
 *
 * @return string Text with the rules applied, or the unchanged text when
 *                there are no rules or the regular-expression engine fails
 */
private function preReplace($rules, $text)
{
    // Nothing to do when the rule set is absent or lacks one of its two lists.
    if (empty($rules['pattern']) || !isset($rules['replacement'])) {
        return $text;
    }

    // Swap the "/" delimiters for backticks and append the "u" modifier so
    // the patterns are matched as UTF-8.
    $preparedPatterns = array_map(
        function ($item) {
            return str_replace('/', '`', $item) . 'u';
        },
        $rules['pattern']
    );

    $replaced = preg_replace($preparedPatterns, $rules['replacement'], $text);

    // preg_replace() yields null on failure (for instance when the subject is
    // not valid UTF-8). Fall back to the untouched text so the caller does not
    // silently lose the whole input.
    if (null === $replaced) {
        return $text;
    }

    return $replaced;
}

/**
* Transliterates cyrillic text to latin and vice versa
* depending on $direction parameter.
Expand All @@ -136,8 +171,12 @@ public function lat2Cyr($text)
public function transliterate($text, $direction)
{
    // Falsy direction: latin input, so run the latin rules and map lat -> cyr.
    if (!$direction) {
        $prepared = $this->preReplace($this->getLatReplacement(), $text);
        $search = $this->getLatMap();
        $replace = $this->getCyrMap();

        return str_replace($search, $replace, $prepared);
    }

    // Truthy direction: cyrillic input, so run the cyrillic rules and map cyr -> lat.
    $prepared = $this->preReplace($this->getCyrReplacement(), $text);
    $search = $this->getCyrMap();
    $replace = $this->getLatMap();

    return str_replace($search, $replace, $prepared);
}
Expand Down Expand Up @@ -170,6 +209,34 @@ public function getLatMap()
return $this->latMap;
}

/**
 * Get cyrillic replacement rules.
 *
 * The rules are looked up on first use and then kept on the instance.
 *
 * @return array cyrillic replacement rules
 */
public function getCyrReplacement()
{
    if (null !== $this->cyrReplacement) {
        return $this->cyrReplacement;
    }

    $this->cyrReplacement = $this->getTransliterationMap(Settings::ALPHABET_CYR_REGEXP);

    return $this->cyrReplacement;
}

/**
 * Get latin replacement rules.
 *
 * The rules are looked up on first use and then kept on the instance.
 *
 * @return array latin replacement rules
 */
public function getLatReplacement()
{
    if (null !== $this->latReplacement) {
        return $this->latReplacement;
    }

    $this->latReplacement = $this->getTransliterationMap(Settings::ALPHABET_LAT_REGEXP);

    return $this->latReplacement;
}

/**
* Get transliteration char map.
*
Expand Down
31 changes: 28 additions & 3 deletions src/Transliterator/data/ru/BGN_PCGN.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,35 @@
'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'ъ', 'ы', 'ь', 'э',
'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ё', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Ъ', 'Ы', 'Ь', 'Э'
),

'cyr_regexp' => array(
'pattern' => array(
'/^Е/', '/^е/', '/([\sъьйЪЬЙуеыаоэяиюУЕЫАОЭЯИЮ])Е/', '/([\sъьйЪЬЙуеыаоэяиюУЕЫАОЭЯИЮ])е/',
'/^Ё/', '/^ё/', '/([\sъьйЪЬЙуеыаоэяиюУЕЫАОЭЯИЮ])Ё/', '/([\sъьйЪЬЙуеыаоэяиюУЕЫАОЭЯИЮ])ё/',
),
'replacement' => array(
'Ye', 'ye', '$1Ye', '$1ye',
"Yë", "yë", "\$1Yë", "\$1yë",
)
),

'lat' => array(
'shch', 'zh', 'kh', 'ts', 'ch', 'sh', 'yu', 'ya',
'Shch', 'Zh', 'Kh', 'Ts', 'Ch', 'Sh', 'Yu', 'Ya',
'a', 'b', 'v', 'g', 'd', 'ye', 'yë', 'z', 'i', 'y', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'ˮ', 'y', 'ʼ', 'e',
'A', 'B', 'V', 'G', 'D', 'Ye', 'Yë', 'Z', 'I', 'Y', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'ˮ', 'Y', 'ʼ', 'E'
)
'a', 'b', 'v', 'g', 'd', 'e', 'ë', 'z', 'i', 'y', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'f', 'ˮ', 'y', 'ʼ', 'e',
'A', 'B', 'V', 'G', 'D', 'E', 'Ë', 'Z', 'I', 'Y', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'F', 'ˮ', 'Y', 'ʼ', 'E'
),

'lat_regexp' => array(
'pattern' => array(
'/Ye/', '/ye/',
'/Y[ёë]/', '/y[ёë]/',
'/([^euioaEUIOA\s])y/'
),
'replacement' => array(
'E', 'е',
"Ё", "ё",
'$1ы'
)
),
);
28 changes: 26 additions & 2 deletions tests/Transliterator/TransliteratorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,32 @@ public static function testRussianBGNPCGNProvider()
return array(
array('Ю ю', 'Yu yu', false),
array('Я я', 'Ya ya', false),
array('Э э', 'E e', false),
array('E e', 'Э э', true)
// array('Э э', 'E e', false),
// array('E e', 'Э э', true)
);
}

/**
 * Transliterates the input in the given direction, compares it with the
 * expected text, then transliterates the result back and expects the
 * original input again.
 *
 * @dataProvider testRussianBGNPCGNSpecificProvider
 */
public function testRussianBGNPCGNSpecific($expected, $actual, $direction)
{
    $forward = self::$transliteratorRu
        ->setSystem(Settings::SYSTEM_BGN_PCGN)
        ->transliterate($actual, $direction);

    $this->assertEquals($expected, $forward);

    // Round trip: the opposite direction must restore the original input.
    $backward = self::$transliteratorRu
        ->setSystem(Settings::SYSTEM_BGN_PCGN)
        ->transliterate($forward, !$direction);

    $this->assertEquals($actual, $backward);
}

/**
 * Data sets for testRussianBGNPCGNSpecific(): expected text, input text, direction.
 */
public static function testRussianBGNPCGNSpecificProvider()
{
    $latin = 'Yekaterinburg Yekaterinburg Yurʼyev Sˮyezd Chapayevsk Belkin Ozërnyy Podˮyëmnyy Gromadʼyë Yyënchëping Ostriyë Yëlkin Sayylyk';
    $cyrillic = 'Екатеринбург Екатеринбург Юрьев Съезд Чапаевск Белкин Озёрный Подъёмный Громадьё Йёнчёпинг Остриё Ёлкин Сайылык';

    return array(
        array($latin, $cyrillic, true),
    );
}

Expand Down