Skip to content

Commit

Permalink
gccrs: Add function Rust::encode_punycode
Browse files Browse the repository at this point in the history
gcc/rust/ChangeLog:

	* Make-lang.in: Add rust-punycode.o.
	* rust-lang.cc (run_rust_tests): Add selftest.
	* util/rust-punycode.cc: New file.
	* util/rust-punycode.h: New file.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
  • Loading branch information
tamaroning authored and P-E-P committed Aug 9, 2023
1 parent 67d1f4a commit ebd449c
Show file tree
Hide file tree
Showing 4 changed files with 229 additions and 0 deletions.
1 change: 1 addition & 0 deletions gcc/rust/Make-lang.in
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ GRS_OBJS = \
rust/rust-feature-gate.o \
rust/rust-dir-owner.o \
rust/rust-unicode.o \
rust/rust-punycode.o \
$(END)
# removed object files from here

Expand Down
2 changes: 2 additions & 0 deletions gcc/rust/rust-lang.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "rust-lex.h"
#include "optional.h"
#include "rust-unicode.h"
#include "rust-punycode.h"

#include <mpfr.h>
// note: header files must be in this order or else forward declarations don't
Expand Down Expand Up @@ -456,6 +457,7 @@ run_rust_tests ()
// Call tests for the rust frontend here
rust_input_source_test ();
rust_utf8_normalize_test ();
rust_punycode_encode_test ();
rust_cfg_parser_test ();
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
Expand Down
180 changes: 180 additions & 0 deletions gcc/rust/util/rust-punycode.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

// This file provides functions for punycode conversion
// See https://datatracker.ietf.org/doc/html/rfc3492

#include "rust-system.h"
#include "rust-unicode.h"
#include "optional.h"
#include "selftest.h"

namespace Rust {

// https://tools.ietf.org/html/rfc3492#section-4.
// Bootstring parameters, set to the values RFC 3492 section 5 assigns to
// Punycode.
// Radix of the variable-length integer digits (a-z, 0-9).
constexpr uint32_t BASE = 36;
// Lower and upper clamp for the per-position digit threshold.
constexpr uint32_t TMIN = 1;
constexpr uint32_t TMAX = 26;
// Skew term used in the final step of bias adaptation.
constexpr uint32_t SKEW = 38;
// Divisor applied to the very first delta during bias adaptation.
constexpr uint32_t DAMP = 700;
// Bias used before any delta has been encoded.
constexpr uint32_t INITIAL_BIAS = 72;
// First non-basic code point (0x80); the encoder starts scanning from here.
constexpr uint32_t INITIAL_N = 128;
// Separates the copied basic code points from the encoded deltas.
constexpr char DELIMITER = '-';

// Largest code point treated as a basic (ASCII) code point.
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;

// Collect, in input order, the string form of every code point in SRC whose
// value does not exceed MAX_ASCII_CODEPOINT. All other code points are
// skipped.
std::string
extract_basic_string (const std::vector<Codepoint> &src)
{
  std::string ascii_part;
  for (auto cp : src)
    {
      if (cp.value > MAX_ASCII_CODEPOINT)
	continue;
      ascii_part += cp.as_string ();
    }
  return ascii_part;
}

// Bias adaptation function (RFC 3492 section 6.1).
// DELTA is the delta that was just encoded, N_POINTS the number of code
// points handled so far (including the current one) and IS_FIRST tells
// whether this is the first delta of the string. Returns the new bias.
uint32_t
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
{
  // The first delta is damped much harder than the following ones.
  if (is_first)
    delta /= DAMP;
  else
    delta /= 2;
  delta += delta / n_points;

  const uint32_t limit = (BASE - TMIN) * TMAX / 2;
  uint32_t k = 0;
  for (; delta > limit; k += BASE)
    delta /= BASE - TMIN;

  return k + (BASE - TMIN + 1) * delta / (delta + SKEW);
}

// Compute LHS - RHS restricted to the range [MIN, MAX] without ever
// evaluating a subtraction that would go below zero: the comparisons are
// done on MIN + RHS and MAX + RHS instead.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
	     const uint32_t max)
{
  const bool at_or_below_floor = lhs <= min + rhs;
  if (at_or_below_floor)
    return min;

  const bool at_or_above_ceiling = lhs >= max + rhs;
  return at_or_above_ceiling ? max : lhs - rhs;
}

// Return the smallest code point value in L that is greater than or equal to
// THRESHOLD. If no element qualifies, UINT32_MAX is returned.
uint32_t
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
{
  uint32_t smallest = UINT32_MAX;
  for (auto cp : l)
    {
      if (cp.value < threshold)
	continue;
      if (cp.value < smallest)
	smallest = cp.value;
    }
  return smallest;
}

// Map a base-36 digit value to its Punycode character:
// 0..25 become 'a'..'z' and 26..35 become '0'..'9'.
char
encode_digit (const uint32_t d)
{
  if (d < 26)
    return 'a' + d;
  return '0' + (d - 26);
}

// Encode INPUT as punycode, following the encoding procedure of RFC 3492
// section 6.3.
//
// The result consists of the basic (ASCII) code points of INPUT, followed by
// DELIMITER when there is at least one of them, followed by the non-basic
// code points encoded as variable-length integers.
//
// Returns tl::nullopt when the 32-bit delta computation would overflow
// (RFC 3492 section 6.4); otherwise returns the encoded string.
tl::optional<std::string>
encode_punycode (const Utf8String &input)
{
  std::vector<Codepoint> input_chars = input.get_chars ();

  uint32_t n = INITIAL_N;
  uint32_t delta = 0;
  uint32_t bias = INITIAL_BIAS;

  // Basic code points are emitted first. H counts the code points handled so
  // far and B remembers how many of them were basic.
  std::string output = extract_basic_string (input_chars);
  uint32_t h = output.size ();
  const uint32_t b = h;
  if (b > 0)
    output += DELIMITER;

  while (h < input_chars.size ())
    {
      // Next code point value to insert: the smallest one not yet handled.
      const uint32_t m = min_gt_or_eq (input_chars, n);

      // Refuse to compute delta + (m - n) * (h + 1) if it does not fit.
      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
	return tl::nullopt;

      delta += (m - n) * (h + 1);
      n = m;

      for (const auto c : input_chars)
	{
	  if (c.value < n)
	    {
	      // The guard above lets delta reach UINT32_MAX, so this
	      // increment has to be checked as well; wrapping around would
	      // silently produce a wrong encoding.
	      if (delta == UINT32_MAX)
		return tl::nullopt;
	      delta++;
	    }
	  else if (c.value == n)
	    {
	      uint32_t q = delta;
	      // encode as a variable length integer
	      for (uint32_t k = 1;; k++)
		{
		  const uint32_t kb = k * BASE;
		  // Threshold for this digit position, kept in [TMIN, TMAX].
		  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
		  if (q < t)
		    break;

		  output += encode_digit (t + (q - t) % (BASE - t));
		  q = (q - t) / (BASE - t);
		}
	      output += encode_digit (q);

	      bias = adapt_bias (delta, h + 1, h == b);
	      delta = 0;
	      h++;
	    }
	}
      delta++;
      n++;
    }

  return {output};
}

} // namespace Rust

// The selftest framework (ASSERT_EQ and friends) only exists when
// CHECKING_P is set, and the declaration of rust_punycode_encode_test is
// guarded the same way, so the definitions must be guarded too. Otherwise
// builds without checking fail to compile.
#if CHECKING_P

namespace selftest {

// Encode INPUT (UTF-8) with Rust::encode_punycode and assert that the result
// equals EXPECTED.
void
encode_assert (const std::string &input, const std::string &expected)
{
  Rust::Utf8String input_utf8
    = Rust::Utf8String::make_utf8_string (input).value ();
  std::string actual = Rust::encode_punycode (input_utf8).value ();
  ASSERT_EQ (actual, expected);
}

// Selftest entry point for Rust::encode_punycode.
void
rust_punycode_encode_test ()
{
  // Purely basic input: copied verbatim and terminated by the delimiter.
  encode_assert ("abc", "abc-");
  encode_assert ("12345", "12345-");
  // Purely non-basic input: no delimiter is emitted.
  encode_assert ("香港", "j6w193g");

  // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
  encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
  encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
  encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
  encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
}

} // namespace selftest

#endif // CHECKING_P
46 changes: 46 additions & 0 deletions gcc/rust/util/rust-punycode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2020-2023 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.

#ifndef RUST_PUNYCODE_H
#define RUST_PUNYCODE_H

#include "rust-unicode.h"
#include "optional.h"

namespace Rust {

/* Encode a string as punycode (RFC 3492). Returns a string if encoding is
 * successful. Returns nullopt otherwise, which the implementation does when
 * its 32-bit delta arithmetic would overflow. Note that a returned string
 * contains only ASCII characters and does not start with `xn--`. */
tl::optional<std::string>
encode_punycode (const Utf8String &src);

} // namespace Rust

#if CHECKING_P

namespace selftest {

// Selftest for Rust::encode_punycode; called from run_rust_tests.
void
rust_punycode_encode_test ();

} // namespace selftest

#endif // CHECKING_P

#endif

0 comments on commit ebd449c

Please sign in to comment.