-
Notifications
You must be signed in to change notification settings - Fork 162
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gccrs: Add function
Rust::encode_punycode
gcc/rust/ChangeLog: * Make-lang.in: Add rust-punycode.o. * rust-lang.cc (run_rust_tests): Add selftest. * util/rust-punycode.cc: New file. * util/rust-punycode.h: New file. Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
- Loading branch information
1 parent
67d1f4a
commit ebd449c
Showing
4 changed files
with
229 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
// Copyright (C) 2020-2023 Free Software Foundation, Inc. | ||
|
||
// This file is part of GCC. | ||
|
||
// GCC is free software; you can redistribute it and/or modify it under | ||
// the terms of the GNU General Public License as published by the Free | ||
// Software Foundation; either version 3, or (at your option) any later | ||
// version. | ||
|
||
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY | ||
// WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
// for more details. | ||
|
||
// You should have received a copy of the GNU General Public License | ||
// along with GCC; see the file COPYING3. If not see | ||
// <http://www.gnu.org/licenses/>. | ||
|
||
// This file provides functions for punycode conversion | ||
// See https://datatracker.ietf.org/doc/html/rfc3492 | ||
|
||
#include "rust-system.h" | ||
#include "rust-unicode.h" | ||
#include "optional.h" | ||
#include "selftest.h" | ||
|
||
namespace Rust { | ||
|
||
// https://tools.ietf.org/html/rfc3492#section-4. | ||
constexpr uint32_t BASE = 36; | ||
constexpr uint32_t TMIN = 1; | ||
constexpr uint32_t TMAX = 26; | ||
constexpr uint32_t SKEW = 38; | ||
constexpr uint32_t DAMP = 700; | ||
constexpr uint32_t INITIAL_BIAS = 72; | ||
constexpr uint32_t INITIAL_N = 128; | ||
constexpr char DELIMITER = '-'; | ||
|
||
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; | ||
|
||
std::string | ||
extract_basic_string (const std::vector<Codepoint> &src) | ||
{ | ||
std::string basic_string; | ||
for (auto c : src) | ||
{ | ||
if (c.value <= MAX_ASCII_CODEPOINT) | ||
basic_string += c.as_string (); | ||
} | ||
return basic_string; | ||
} | ||
|
||
uint32_t | ||
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first) | ||
{ | ||
delta /= is_first ? DAMP : 2; | ||
delta += delta / n_points; | ||
uint32_t k = 0; | ||
|
||
while (delta > (BASE - TMIN) * TMAX / 2) | ||
{ | ||
delta /= BASE - TMIN; | ||
k += BASE; | ||
} | ||
return k + (BASE - TMIN + 1) * delta / (delta + SKEW); | ||
} | ||
|
||
uint32_t | ||
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs, | ||
const uint32_t max) | ||
{ | ||
if (min + rhs >= lhs) | ||
return min; | ||
else if (max + rhs <= lhs) | ||
return max; | ||
else | ||
return lhs - rhs; | ||
} | ||
|
||
uint32_t | ||
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold) | ||
{ | ||
uint32_t min = UINT32_MAX; | ||
for (auto c : l) | ||
if (c.value >= threshold && c.value < min) | ||
min = c.value; | ||
return min; | ||
} | ||
|
||
char | ||
encode_digit (const uint32_t d) | ||
{ | ||
return d + 22 + (d < 26 ? 75 : 0); | ||
} | ||
|
||
tl::optional<std::string> | ||
encode_punycode (const Utf8String &input) | ||
{ | ||
std::vector<Codepoint> input_chars = input.get_chars (); | ||
|
||
uint32_t n = INITIAL_N; | ||
uint32_t delta = 0; | ||
uint32_t bias = INITIAL_BIAS; | ||
|
||
std::string output = extract_basic_string (input_chars); | ||
uint32_t h = output.size (); | ||
const uint32_t b = h; | ||
if (b > 0) | ||
output += DELIMITER; | ||
|
||
while (h < input_chars.size ()) | ||
{ | ||
const uint32_t m = min_gt_or_eq (input_chars, n); | ||
|
||
if (m - n > ((UINT32_MAX - delta) / (h + 1))) | ||
return tl::nullopt; | ||
|
||
delta += (m - n) * (h + 1); | ||
n = m; | ||
|
||
for (const auto c : input_chars) | ||
{ | ||
if (c.value < n) | ||
delta++; | ||
else if (c.value == n) | ||
{ | ||
uint32_t q = delta; | ||
// encode as a variable length integer | ||
for (uint32_t k = 1;; k++) | ||
{ | ||
const uint32_t kb = k * BASE; | ||
const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX); | ||
if (q < t) | ||
break; | ||
|
||
output += encode_digit (t + (q - t) % (BASE - t)); | ||
q = (q - t) / (BASE - t); | ||
} | ||
output += encode_digit (q); | ||
|
||
bias = adapt_bias (delta, h + 1, h == b); | ||
delta = 0; | ||
h++; | ||
} | ||
} | ||
delta++; | ||
n++; | ||
} | ||
|
||
return {output}; | ||
} | ||
|
||
} // namespace Rust | ||
|
||
namespace selftest { | ||
|
||
void | ||
encode_assert (const std::string &input, const std::string &expected) | ||
{ | ||
Rust::Utf8String input_utf8 | ||
= Rust::Utf8String::make_utf8_string (input).value (); | ||
std::string actual = Rust::encode_punycode (input_utf8).value (); | ||
ASSERT_EQ (actual, expected); | ||
} | ||
|
||
void | ||
rust_punycode_encode_test () | ||
{ | ||
encode_assert ("abc", "abc-"); | ||
encode_assert ("12345", "12345-"); | ||
encode_assert ("香港", "j6w193g"); | ||
|
||
// Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1 | ||
encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn"); | ||
encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye"); | ||
encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb"); | ||
encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a"); | ||
} | ||
|
||
} // namespace selftest |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// Copyright (C) 2020-2023 Free Software Foundation, Inc. | ||
|
||
// This file is part of GCC. | ||
|
||
// GCC is free software; you can redistribute it and/or modify it under | ||
// the terms of the GNU General Public License as published by the Free | ||
// Software Foundation; either version 3, or (at your option) any later | ||
// version. | ||
|
||
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY | ||
// WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | ||
// for more details. | ||
|
||
// You should have received a copy of the GNU General Public License | ||
// along with GCC; see the file COPYING3. If not see | ||
// <http://www.gnu.org/licenses/>. | ||
|
||
#ifndef RUST_PUNYCODE_H | ||
#define RUST_PUNYCODE_H | ||
|
||
#include "rust-unicode.h" | ||
#include "optional.h" | ||
|
||
namespace Rust { | ||
|
||
/* Encode a string as punycode. Returns a string if encoding is successful. | ||
* Returns nullopt otherwise. Note that a returned string contains only ASCII | ||
* characters and does not start with `xn--`. */ | ||
tl::optional<std::string> | ||
encode_punycode (const Utf8String &src); | ||
|
||
} // namespace Rust | ||
|
||
#if CHECKING_P | ||
|
||
namespace selftest { | ||
|
||
void | ||
rust_punycode_encode_test (); | ||
|
||
} // namespace selftest | ||
|
||
#endif // CHECKING_P | ||
|
||
#endif |