Skip to content

Commit

Permalink
Support for Transliteration (bchavez#233)
Browse files Browse the repository at this point in the history
Fixes bchavez#225. Transliteration using Trie data-structure. ❤️
  • Loading branch information
bchavez authored Jul 2, 2019
1 parent bd5bd4a commit ef9d73c
Show file tree
Hide file tree
Showing 16 changed files with 1,681 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "Source/hashids"]
path = Source/hashids
url = https://github.com/ullmark/hashids.net.git
[submodule "Source/speakingurl"]
path = Source/speakingurl
url = https://github.com/pid/speakingurl.git
8 changes: 7 additions & 1 deletion HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
## v27.0.2
## v28.0.1
Release Date: TBA

* BREAKING: Deterministic sequence values may have changed for fake email addresses derived from `Internet.Email()` or `Internet.UserName()` in locales other than `en`.
* Issue 229: Adds `Finance.Iban(countryCode)` ISO3166 country code parameter. Allows generating IBAN codes for specific countries. The country code must be supported; otherwise an exception is thrown.
* Issue 225: Better support for transliteration of international Unicode characters to US-Latin/Roman ASCII character sets. `Internet.Email()` and `Internet.UserName()` are more respectful of specified locale using character transliteration.
* Added `.Transliterate()` string extension method in `Bogus.Extensions` namespace.
* Added `Internet.UserNameUnicode()` that preserves Unicode characters in user names.
* Minor performance improvement to `Utils.Slugify` using compiled regex.


## v27.0.1
Release Date: 2019-05-02
Expand Down
30 changes: 30 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,33 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================================================

The BSD 3-Clause License (BSD3)

Copyright (c) 2013-2017 Sascha Droste <pid@posteo.net>
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
* Neither the name of the author nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior
written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 changes: 11 additions & 1 deletion Source/Bogus.Tests/DataSetTests/InternetTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Linq;
using System.Linq;
using Bogus.DataSets;
using FluentAssertions;
using Xunit;
Expand Down Expand Up @@ -150,5 +150,15 @@ public void can_pick_random_browser()
i => internet.UserAgent())
.Dump();
}

/// <summary>
/// Checks that <c>UserNameUnicode</c> returns the Cyrillic input unchanged
/// (no transliteration) for each seeded case listed below.
/// </summary>
[Theory]
[InlineData("Анна", "Фомина", "Анна11", 1337)]
[InlineData("Анна", "Фомина", "Анна_Фомина13", 228)]
[InlineData("Анна", "Фомина", "Анна.Фомина", 302)]
public void can_get_username_with_unicode_characters(string first, string last, string expected, int seed)
{
    // Re-seed the data set so the randomly chosen user-name shape is reproducible.
    internet.Random = new Randomizer(seed);

    var userName = internet.UserNameUnicode(first, last);

    userName.Should().Be(expected);
}
}
}
38 changes: 38 additions & 0 deletions Source/Bogus.Tests/GitHubIssues/Issue225.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
using Bogus.DataSets;
using Bogus.Extensions;
using FluentAssertions;
using Xunit;

namespace Bogus.Tests.GitHubIssues
{
    /// <summary>
    /// Regression tests for GitHub issue 225: e-mail addresses and strings built
    /// from Cyrillic names are expected to come out as Latin/ASCII text.
    /// </summary>
    public class Issue225 : SeededTest
    {
        /// <summary>
        /// A person generated for the "ru" locale keeps a Cyrillic full name
        /// while the e-mail address is Latin.
        /// </summary>
        [Fact]
        public void can_generate_sane_email_addresses_in_different_locales()
        {
            var person = new Bogus.Person("ru");

            person.FullName.Should().Be("Анастасия Евсеева");
            person.Email.Should().Be("Anastasiya69@gmail.com");
        }

        /// <summary>
        /// Explicit Cyrillic first/last names yield a Latin e-mail address.
        /// </summary>
        [Fact]
        public void can_generate_sane_email_address_from_ru()
        {
            var internet = new Internet();

            var email = internet.Email("Анна", "Фомина");

            email.Should().Be("Anna81@yahoo.com");
        }

        // NOTE(review): this test performs the same call and asserts the same
        // transliterated value as can_generate_sane_email_address_from_ru, so the
        // "without_transliteration" name does not match what is verified.
        // TODO confirm the intended scenario and expected value.
        [Fact]
        public void can_generate_email_without_transliteration()
        {
            var internet = new Internet();

            var email = internet.Email("Анна", "Фомина");

            email.Should().Be("Anna81@yahoo.com");
        }

        /// <summary>
        /// The <c>Transliterate()</c> string extension converts Cyrillic to Latin.
        /// </summary>
        [Fact]
        public void simple_translation()
        {
            var transliterated = "Анна Фомина".Transliterate();

            transliterated.Should().Be("Anna Fomina");
        }
    }
}
4 changes: 2 additions & 2 deletions Source/Bogus.Tests/GitHubIssues/Issue86.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ public void should_remove_diacritic_marks()
/// <summary>
/// Accented and special characters in the name parts are transliterated to
/// ASCII in the generated e-mail address.
/// </summary>
[Fact]
public void should_remove_diacritic_marks_in_email()
{
    // Only the transliterated expectation is kept ("ß" -> "ss", "ÄÖÜ" -> "AeOeUe").
    // The earlier "rainAOUi81@yahoo.com" expectation contradicts it for the same
    // seeded call, so both assertions could never pass together.
    internet.Email("ßra'inÄÖÜí", "ÄÖÜíchavez").Should().Be("ssrainAeOeUei81@yahoo.com");
}

/// <summary>
/// Accented and special characters in the name parts are transliterated to
/// ASCII in the generated user name.
/// </summary>
[Fact]
public void should_remove_diacritic_marks_in_username()
{
    // Only the transliterated expectation is kept ("ß" -> "ss", "ÄÖÜ" -> "AeOeUe").
    // The earlier "riAOUian.chaAOUiez" expectation contradicts it for the same
    // seeded call, so both assertions could never pass together.
    internet.UserName("ßri'ÄÖÜían", "chaÄÖÜíez").Should().Be("ssriAeOeUeian.chaAeOeUeiez");
}
}
}
68 changes: 68 additions & 0 deletions Source/Bogus.Tests/TransliterateTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
using System;
using FluentAssertions;
using Xunit;

namespace Bogus.Tests
{
    /// <summary>
    /// Unit tests for <c>Transliterater.Translate</c>: single characters,
    /// multi-character sequences, symbols, and language-specific mappings.
    /// </summary>
    public class TransliterateTests
    {
        /// <summary>
        /// Smoke test covering a single accented letter and two
        /// multi-character inputs.
        /// </summary>
        [Fact]
        public void Test()
        {
            Transliterater.Translate("À").Should().Be("A");
            Transliterater.Translate("ден").Should().Be("MKD");
            Transliterater.Translate("စျ").Should().Be("za");
        }

        /// <summary>
        /// A multi-character sequence followed by one more mapped character.
        /// </summary>
        [Fact]
        public void index_test()
        {
            Transliterater.Translate("ေါင်ူ").Should().Be("aungu");
        }

        /// <summary>
        /// A single mapped character.
        /// </summary>
        [Fact]
        public void simple_test()
        {
            Transliterater.Translate("ာ").Should().Be("a");
        }

        /// <summary>
        /// Cyrillic text becomes Latin text; the space is kept.
        /// </summary>
        [Fact]
        public void basic_ru_test()
        {
            Transliterater.Translate("Анна Фомина").Should().Be("Anna Fomina");
        }

        /// <summary>
        /// A multi-character sequence followed by plain ASCII, which passes
        /// through unchanged.
        /// </summary>
        [Fact]
        public void index2_test()
        {
            Transliterater.Translate("ေါင်ff").Should().Be("aungff");
        }

        /// <summary>
        /// An unrecognised language code must not cause an exception.
        /// </summary>
        [Fact]
        public void transliterate_with_unknown_language_doesnt_throw()
        {
            Action a = () => Transliterater.Translate("fefefe", "gggg");
            a.ShouldNotThrow();
        }

        /// <summary>
        /// Symbols are translated to words when no language is given.
        /// </summary>
        [Fact]
        public void can_translate_symbol()
        {
            Transliterater.Translate("♥").Should().Be("love");
        }

        /// <summary>
        /// Symbols are translated to the word for the requested language.
        /// </summary>
        [Fact]
        public void can_translate_symbol_with_locale()
        {
            Transliterater.Translate("♥", "es").Should().Be("amor");
        }

        /// <summary>
        /// The same character can map differently depending on the language.
        /// </summary>
        [Fact]
        public void can_translate_with_langchar_map()
        {
            Transliterater.Translate("Ä").Should().Be("Ae");
            Transliterater.Translate("Ä", lang: "fi").Should().Be("A");
            Transliterater.Translate("Ä", lang: "hu").Should().Be("A");
        }
    }
}
34 changes: 25 additions & 9 deletions Source/Bogus/DataSets/Internet.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Linq;
using System.Text.RegularExpressions;
using Bogus.Extensions;
using Bogus.Vendor;

namespace Bogus.DataSets
Expand Down Expand Up @@ -52,7 +53,7 @@ public string Email(string firstName = null, string lastName = null, string prov
{
provider = provider ?? GetRandomArrayItem("free_email");

return Utils.Slugify(UserName(firstName, lastName)) + uniqueSuffix + "@" + provider;
return UserName(firstName, lastName) + uniqueSuffix + "@" + provider;
}

/// <summary>
Expand All @@ -70,34 +71,49 @@ public string ExampleEmail(string firstName = null, string lastName = null)
/// <summary>
/// Generates user names. The name parts are transliterated using this data
/// set's locale, combined by <see cref="UserNameUnicode"/>, and the result is
/// passed through <c>Utils.Slugify</c>.
/// </summary>
/// <param name="firstName">First name is always part of the returned user name. When null, <c>Name.FirstName()</c> is used.</param>
/// <param name="lastName">Last name may or may not be used. When null, <c>Name.LastName()</c> is used.</param>
/// <returns>A random user name.</returns>
public string UserName(string firstName = null, string lastName = null)
{
    firstName = firstName ?? Name.FirstName();
    lastName = lastName ?? Name.LastName();

    // Transliterate each part before composing, so the slugify step receives
    // already-converted text.
    firstName = firstName.Transliterate(this.Locale);
    lastName = lastName.Transliterate(this.Locale);

    return Utils.Slugify(UserNameUnicode(firstName, lastName));
}

/// <summary>
/// Generates a user name preserving Unicode characters.
/// </summary>
/// <param name="firstName">First name is always part of the returned user name. When null, <c>Name.FirstName()</c> is used.</param>
/// <param name="lastName">Last name may or may not be used. When null, <c>Name.LastName()</c> is used.</param>
/// <returns>
/// One of three shapes: first name plus a number; first and last name joined
/// by "." or "_"; or that joined form plus a number. Spaces are removed.
/// </returns>
public string UserNameUnicode(string firstName = null, string lastName = null)
{
    firstName = firstName ?? Name.FirstName();
    lastName = lastName ?? Name.LastName();

    // Selects which of the three user-name shapes is produced.
    var val = Random.Number(2);

    string result;

    if (val == 0)
    {
        result = firstName + Random.Number(99);
    }
    else if (val == 1)
    {
        result = firstName + Random.ArrayElement(new[] { ".", "_" }) + lastName;
    }
    else
    {
        result = firstName + Random.ArrayElement(new[] { ".", "_" }) + lastName + Random.Number(99);
    }

    // Only spaces are stripped. The value is deliberately not passed through
    // Utils.Slugify here, so non-ASCII characters survive; UserName() applies
    // slugification on top of this method.
    result = result.Replace(" ", string.Empty);

    return result;
}

/// <summary>
Expand Down
12 changes: 11 additions & 1 deletion Source/Bogus/Extensions/ExtensionsForString.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System;
using System;
using System.Globalization;
using System.Text;

Expand Down Expand Up @@ -51,5 +51,15 @@ public static string RemoveDiacritics(this string @this)

return sb.ToString().Normalize(NormalizationForm.FormC);
}

/// <summary>
/// Transliterates Unicode characters to US-ASCII. For example, Russian Cyrillic "Анна Фомина" becomes "Anna Fomina".
/// </summary>
/// <param name="this">The @this string to act on.</param>
/// <param name="lang">The language character set to use. Defaults to "en".</param>
/// <returns>The string returned by <c>Transliterater.Translate</c> for the given input and language.</returns>
public static string Transliterate(this string @this, string lang = "en")
{
return Transliterater.Translate(@this, lang);
}
}
}
Loading

0 comments on commit ef9d73c

Please sign in to comment.