blob: 89476f2cc82fac7ade231f5ed827ad5cd60c1584 [file] [log] [blame]
// Copyright (C) 2020-2024 Free Software Foundation, Inc.
// This file is part of GCC.
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
// This file provides functions for punycode conversion
// See https://datatracker.ietf.org/doc/html/rfc3492
#include "rust-system.h"
#include "rust-unicode.h"
#include "optional.h"
#include "selftest.h"
namespace Rust {
// https://tools.ietf.org/html/rfc3492#section-4.
constexpr uint32_t BASE = 36;
constexpr uint32_t TMIN = 1;
constexpr uint32_t TMAX = 26;
constexpr uint32_t SKEW = 38;
constexpr uint32_t DAMP = 700;
constexpr uint32_t INITIAL_BIAS = 72;
constexpr uint32_t INITIAL_N = 128;
constexpr char DELIMITER = '-';
std::string
extract_basic_string (const std::vector<Codepoint> &src)
{
std::string basic_string;
for (auto c : src)
{
if (c.is_ascii ())
basic_string += c.as_string ();
}
return basic_string;
}
uint32_t
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
{
delta /= is_first ? DAMP : 2;
delta += delta / n_points;
uint32_t k = 0;
while (delta > (BASE - TMIN) * TMAX / 2)
{
delta /= BASE - TMIN;
k += BASE;
}
return k + (BASE - TMIN + 1) * delta / (delta + SKEW);
}
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
const uint32_t max)
{
if (min + rhs >= lhs)
return min;
else if (max + rhs <= lhs)
return max;
else
return lhs - rhs;
}
uint32_t
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
{
uint32_t min = UINT32_MAX;
for (auto c : l)
if (c.value >= threshold && c.value < min)
min = c.value;
return min;
}
char
encode_digit (const uint32_t d)
{
return d + 22 + (d < 26 ? 75 : 0);
}
tl::optional<std::string>
encode_punycode (const Utf8String &input)
{
std::vector<Codepoint> input_chars = input.get_chars ();
uint32_t n = INITIAL_N;
uint32_t delta = 0;
uint32_t bias = INITIAL_BIAS;
std::string output = extract_basic_string (input_chars);
uint32_t h = output.size ();
const uint32_t b = h;
if (b > 0)
output += DELIMITER;
while (h < input_chars.size ())
{
const uint32_t m = min_gt_or_eq (input_chars, n);
if (m - n > ((UINT32_MAX - delta) / (h + 1)))
return tl::nullopt;
delta += (m - n) * (h + 1);
n = m;
for (const auto c : input_chars)
{
if (c.value < n)
delta++;
else if (c.value == n)
{
uint32_t q = delta;
// encode as a variable length integer
for (uint32_t k = 1;; k++)
{
const uint32_t kb = k * BASE;
const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
if (q < t)
break;
output += encode_digit (t + (q - t) % (BASE - t));
q = (q - t) / (BASE - t);
}
output += encode_digit (q);
bias = adapt_bias (delta, h + 1, h == b);
delta = 0;
h++;
}
}
delta++;
n++;
}
return {output};
}
} // namespace Rust
#if CHECKING_P
namespace selftest {
void
encode_assert (const std::string &input, const std::string &expected)
{
Rust::Utf8String input_utf8
= Rust::Utf8String::make_utf8_string (input).value ();
std::string actual = Rust::encode_punycode (input_utf8).value ();
ASSERT_EQ (actual, expected);
}
void
rust_punycode_encode_test ()
{
encode_assert ("abc", "abc-");
encode_assert ("12345", "12345-");
encode_assert ("香港", "j6w193g");
// Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
}
} // namespace selftest
#endif // CHECKING_P