gcc/rust/util/rust-unicode.cc - gcc.git - Git at Google

 // Copyright (C) 2020-2025 Free Software Foundation, Inc.

 // This file is part of GCC.

 // GCC is free software; you can redistribute it and/or modify it under
 // the terms of the GNU General Public License as published by the Free
 // Software Foundation; either version 3, or (at your option) any later
 // version.

 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 // for more details.

 // You should have received a copy of the GNU General Public License
 // along with GCC; see the file COPYING3.  If not see
 // <http://www.gnu.org/licenses/>.

 #include "rust-input-source.h"
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"
 #include "rust-lex.h"
 #include "rust-unicode.h"

 #include "rust-unicode-data.h"

 namespace Rust {

 typedef Codepoint codepoint_t;
 typedef std::vector<codepoint_t> string_t;

 // These constants are used to compose and decompose of Hangul syllables.
 // See `Sample Code for Hangul Algorithms` in 3.1.2
 // unicode.org/versions/Unicode15.0.0/ch03.pdf
 const uint32_t S_BASE = 0xAC00;
 const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
 const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
 const uint32_t N_COUNT = V_COUNT * T_COUNT;
 const uint32_t S_COUNT = L_COUNT * N_COUNT;

 // Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
 template <std::size_t SIZE>
 bool
 binary_search_ranges (
   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
   uint32_t target_cp)
 {
   auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
 			      [] (const std::pair<uint32_t, uint32_t> &a,
 				  uint32_t b) { return a.second <= b; });
   if (it == ranges.end ())
     return false;
   else
     return it->first <= target_cp && target_cp < it->second;
 }

 int
 lookup_cc (codepoint_t c)
 {
   auto it = CCC_TABLE.find (c.value);
   if (it != CCC_TABLE.end ())
     return it->second;
   else
     // Starter. Returns zero.
     return 0;
 }

 tl::optional<codepoint_t>
 lookup_recomp (codepoint_t starter, codepoint_t c)
 {
   auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
   if (it != Rust::RECOMPOSITION_MAP.end ())
     return {it->second};

   it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
   if (it != Rust::RECOMPOSITION_MAP.end ())
     return {it->second};

   return tl::nullopt;
 }

 void
 recursive_decomp_cano (codepoint_t c, string_t &buf)
 {
   auto it = Rust::DECOMPOSITION_MAP.find (c.value);
   if (it != Rust::DECOMPOSITION_MAP.end ())
     {
       std::vector<uint32_t> decomped = it->second;
       for (uint32_t cp : decomped)
 	recursive_decomp_cano (cp, buf);
     }
   else
     buf.push_back (c);
 }

 string_t
 decomp_cano (string_t s)
 {
   string_t buf;
   for (codepoint_t c : s)
     {
       int64_t s_index = c.value - S_BASE;
       if (0 <= s_index && s_index < S_COUNT)
 	{
 	  // decompose Hangul argorithmically
 	  uint32_t l = L_BASE + s_index / N_COUNT;
 	  uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
 	  uint32_t t = T_BASE + s_index % T_COUNT;
 	  buf.push_back (l);
 	  buf.push_back (v);
 	  if (t != T_BASE)
 	    buf.push_back (t);
 	  continue;
 	}

       // Current character is not hangul
       recursive_decomp_cano (c, buf);
     }
   return buf;
 }

 void
 sort_cano (string_t &s)
 {
   int cc_here, cc_prev;
   if (s.size () == 1)
     return;
   for (unsigned int i = 1; i < s.size (); i++)
     {
       cc_here = lookup_cc (s[i]);
       cc_prev = lookup_cc (s[i - 1]);
       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
 	{
 	  // swap
 	  codepoint_t tmp = s[i];
 	  s[i] = s[i - 1];
 	  s[i - 1] = tmp;
 	  if (i > 1)
 	    i -= 2;
 	}
     }
 }

 string_t
 compose_hangul (string_t s)
 {
   string_t buf;
   if (s.size () < 2)
     return s;

   codepoint_t last = s[0];
   buf.push_back (last);
   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     {
       codepoint_t ch = s[src_pos];

       // L V => LV
       int64_t l_index = last.value - L_BASE;
       if (0 <= l_index && l_index < L_COUNT)
 	{
 	  int64_t v_index = ch.value - V_BASE;
 	  if (0 <= v_index && v_index < V_COUNT)
 	    {
 	      last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
 	      // pop L
 	      buf.pop_back ();
 	      buf.push_back (last);
 	      continue;
 	    }
 	}

       // LV T => LVT
       int64_t s_index = last.value - S_BASE;
       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
 	{
 	  int64_t t_index = ch.value - T_BASE;
 	  if (0 < t_index && t_index < T_COUNT)
 	    {
 	      last.value += t_index;
 	      // pop LV
 	      buf.pop_back ();
 	      buf.push_back (last);
 	      continue;
 	    }
 	}
       last = ch;
       buf.push_back (last);
     }
   return buf;
 }

 string_t
 recomp (string_t s)
 {
   // compose hangul first
   s = compose_hangul (s);

   string_t buf;
   if (s.size () < 2)
     return s;

   int last_class = -1;
   // int starter_pos = 0; // Assume the first character is Starter. Correct?
   // int target_pos = 1;
   codepoint_t starter_ch = s[0];

   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     {
       // get current character
       codepoint_t ch = s[src_pos];

       int ch_class = lookup_cc (ch);
       tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
       if (composite.has_value () && last_class < ch_class)
 	{
 	  // ch can be composed
 	  buf.push_back (composite.value ());
 	  starter_ch = composite.value ();
 	}
       else if (ch_class == 0)
 	{
 	  // ch is Starter and cannot be composed.
 	  if (src_pos == 1)
 	    // FIXME: buggy?
 	    buf.push_back (starter_ch);
 	  starter_ch = ch;
 	  last_class = -1;
 	  buf.push_back (ch);
 	}
       else
 	{
 	  if (src_pos == 1)
 	    // FIXME: buggy?
 	    buf.push_back (starter_ch);
 	  // ch is not Starter.
 	  last_class = ch_class;
 	  buf.push_back (ch);
 	}
     }
   return buf;
 }

 // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
 QuickCheckResult
 nfc_quick_check (const string_t &s)
 {
   int last_canonical_class = 0;
   QuickCheckResult res = QuickCheckResult::YES;

   for (unsigned long i = 0; i < s.size (); i++)
     {
       codepoint_t c = s[i];

       if (c.is_supplementary_character ())
 	i++;

       int canonical_class = lookup_cc (c);
       if (last_canonical_class > canonical_class && canonical_class != 0)
 	return QuickCheckResult::NO;

       if (is_nfc_qc_no (c.value))
 	return QuickCheckResult::NO;

       if (is_nfc_qc_maybe (c.value))
 	res = QuickCheckResult::MAYBE;

       last_canonical_class = canonical_class;
     }
   return res;
 }

 string_t
 nfc_normalize (const string_t &s)
 {
   if (nfc_quick_check (s) == QuickCheckResult::YES)
     return s;

   // TODO: optimize normalization.
   // i.e. only normalize a limited area around MAYBE character, instead of
   // performing complete normlization of the entire string

   // decompose
   string_t d = decomp_cano (s);
   sort_cano (d);

   // recompose
   string_t r = recomp (d);
   return r;
 }

 Utf8String
 Utf8String::nfc_normalize () const
 {
   return Utf8String (Rust::nfc_normalize (chars));
 }

 bool
 is_alphabetic (uint32_t codepoint)
 {
   return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
 }

 bool
 is_numeric (uint32_t codepoint)
 {
   return std::binary_search (NUMERIC_CODEPOINTS.begin (),
 			     NUMERIC_CODEPOINTS.end (), codepoint);
 }

 bool
 is_nfc_qc_maybe (uint32_t codepoint)
 {
   return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
 }

 bool
 is_nfc_qc_no (uint32_t codepoint)
 {
   return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
 }

 bool
 is_ascii_only (const std::string &str)
 {
   for (char c : str)
     if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
       return false;
   return true;
 }

 } // namespace Rust

 #if CHECKING_P

 namespace selftest {

 void
 rust_nfc_qc_test ()
 {
   ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
 	     Rust::QuickCheckResult::YES);
   ASSERT_EQ (Rust::nfc_quick_check (
 	       {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
 	     Rust::QuickCheckResult::MAYBE);
   ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
 	     Rust::QuickCheckResult::NO);
 }

 void
 assert_normalize (const std::vector<Rust::Codepoint> origin,
 		  const std::vector<Rust::Codepoint> expected)
 {
   std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);

   ASSERT_EQ (actual.size (), expected.size ());
   for (unsigned int i = 0; i < actual.size (); i++)
     {
       ASSERT_EQ (actual[i], expected[i]);
     }
 }

 void
 rust_utf8_normalize_test ()
 {
   // ASCII
   assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
   // ASCII
   assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});

   // testcases retrieved from Part0 of
   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
   assert_normalize ({0x1e0a}, {0x1e0a});
   assert_normalize ({0x1e0c}, {0x1e0c});
   assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

   // testcases for Hangul from Part0
   assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
   assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
   // testcases for Hangul from Part1
   assert_normalize ({0x3131}, {0x3131});
   assert_normalize ({0x3132}, {0x3132});
   // testcases for Hangul from Part3
   assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
   assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});

   // TODO: add more testcases in
   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
 }

 void
 rust_utf8_property_test ()
 {
   ASSERT_TRUE (Rust::is_alphabetic ('A'));
   ASSERT_TRUE (Rust::is_alphabetic ('B'));
   ASSERT_TRUE (Rust::is_alphabetic ('x'));
   ASSERT_TRUE (Rust::is_alphabetic ('z'));
   ASSERT_TRUE (Rust::is_alphabetic (0x00b5));  // µ
   ASSERT_TRUE (Rust::is_alphabetic (0x3093));  // ん
   ASSERT_TRUE (Rust::is_alphabetic (0xa8f2));  // ꣲ
   ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

   ASSERT_FALSE (Rust::is_alphabetic ('\v'));
   ASSERT_FALSE (Rust::is_alphabetic ('-'));
   ASSERT_FALSE (Rust::is_alphabetic ('_'));
   ASSERT_FALSE (Rust::is_alphabetic ('+'));
   ASSERT_FALSE (Rust::is_alphabetic ('0'));
   ASSERT_FALSE (Rust::is_alphabetic ('1'));
   ASSERT_FALSE (Rust::is_alphabetic ('2'));
   ASSERT_FALSE (Rust::is_alphabetic ('9'));
   ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
   ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

   // `Nd`s
   ASSERT_TRUE (Rust::is_numeric ('0'));
   ASSERT_TRUE (Rust::is_numeric ('1'));
   ASSERT_TRUE (Rust::is_numeric ('7'));
   ASSERT_TRUE (Rust::is_numeric ('9'));
   ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
   ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
   // `Nl`s
   ASSERT_TRUE (Rust::is_numeric (0x16e6));  // ᛮ
   ASSERT_TRUE (Rust::is_numeric (0xa6e6));  // ꛦ
   ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
   ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
   // `No`s
   ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
   ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

   ASSERT_FALSE (Rust::is_numeric ('\n'));
   ASSERT_FALSE (Rust::is_numeric ('-'));
   ASSERT_FALSE (Rust::is_numeric ('_'));
   ASSERT_FALSE (Rust::is_numeric ('('));
   ASSERT_FALSE (Rust::is_numeric ('z'));
   ASSERT_FALSE (Rust::is_numeric (';'));
   ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
   ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
   ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
   ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
   ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
 }

 } // namespace selftest

 #endif // CHECKING_P
	// Copyright (C) 2020-2025 Free Software Foundation, Inc.

	// This file is part of GCC.

	// GCC is free software; you can redistribute it and/or modify it under
	// the terms of the GNU General Public License as published by the Free
	// Software Foundation; either version 3, or (at your option) any later
	// version.

	// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	// WARRANTY; without even the implied warranty of MERCHANTABILITY or
	// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	// for more details.

	// You should have received a copy of the GNU General Public License
	// along with GCC; see the file COPYING3. If not see
	// <http://www.gnu.org/licenses/>.

	#include "rust-input-source.h"
	#include "rust-system.h"
	#include "optional.h"
	#include "selftest.h"
	#include "rust-lex.h"
	#include "rust-unicode.h"

	#include "rust-unicode-data.h"

	namespace Rust {

	typedef Codepoint codepoint_t;
	typedef std::vector<codepoint_t> string_t;

	// These constants are used to compose and decompose of Hangul syllables.
	// See `Sample Code for Hangul Algorithms` in 3.1.2
	// unicode.org/versions/Unicode15.0.0/ch03.pdf
	const uint32_t S_BASE = 0xAC00;
	const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
	const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
	const uint32_t N_COUNT = V_COUNT * T_COUNT;
	const uint32_t S_COUNT = L_COUNT * N_COUNT;

	// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
	template <std::size_t SIZE>
	bool
	binary_search_ranges (
	const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
	uint32_t target_cp)
	{
	auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
	[] (const std::pair<uint32_t, uint32_t> &a,
	uint32_t b) { return a.second <= b; });
	if (it == ranges.end ())
	return false;
	else
	return it->first <= target_cp && target_cp < it->second;
	}

	int
	lookup_cc (codepoint_t c)
	{
	auto it = CCC_TABLE.find (c.value);
	if (it != CCC_TABLE.end ())
	return it->second;
	else
	// Starter. Returns zero.
	return 0;
	}

	tl::optional<codepoint_t>
	lookup_recomp (codepoint_t starter, codepoint_t c)
	{
	auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
	if (it != Rust::RECOMPOSITION_MAP.end ())
	return {it->second};

	it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
	if (it != Rust::RECOMPOSITION_MAP.end ())
	return {it->second};

	return tl::nullopt;
	}

	void
	recursive_decomp_cano (codepoint_t c, string_t &buf)
	{
	auto it = Rust::DECOMPOSITION_MAP.find (c.value);
	if (it != Rust::DECOMPOSITION_MAP.end ())
	{
	std::vector<uint32_t> decomped = it->second;
	for (uint32_t cp : decomped)
	recursive_decomp_cano (cp, buf);
	}
	else
	buf.push_back (c);
	}

	string_t
	decomp_cano (string_t s)
	{
	string_t buf;
	for (codepoint_t c : s)
	{
	int64_t s_index = c.value - S_BASE;
	if (0 <= s_index && s_index < S_COUNT)
	{
	// decompose Hangul argorithmically
	uint32_t l = L_BASE + s_index / N_COUNT;
	uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
	uint32_t t = T_BASE + s_index % T_COUNT;
	buf.push_back (l);
	buf.push_back (v);
	if (t != T_BASE)
	buf.push_back (t);
	continue;
	}

	// Current character is not hangul
	recursive_decomp_cano (c, buf);
	}
	return buf;
	}

	void
	sort_cano (string_t &s)
	{
	int cc_here, cc_prev;
	if (s.size () == 1)
	return;
	for (unsigned int i = 1; i < s.size (); i++)
	{
	cc_here = lookup_cc (s[i]);
	cc_prev = lookup_cc (s[i - 1]);
	if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
	{
	// swap
	codepoint_t tmp = s[i];
	s[i] = s[i - 1];
	s[i - 1] = tmp;
	if (i > 1)
	i -= 2;
	}
	}
	}

	string_t
	compose_hangul (string_t s)
	{
	string_t buf;
	if (s.size () < 2)
	return s;

	codepoint_t last = s[0];
	buf.push_back (last);
	for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
	{
	codepoint_t ch = s[src_pos];

	// L V => LV
	int64_t l_index = last.value - L_BASE;
	if (0 <= l_index && l_index < L_COUNT)
	{
	int64_t v_index = ch.value - V_BASE;
	if (0 <= v_index && v_index < V_COUNT)
	{
	last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
	// pop L
	buf.pop_back ();
	buf.push_back (last);
	continue;
	}
	}

	// LV T => LVT
	int64_t s_index = last.value - S_BASE;
	if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
	{
	int64_t t_index = ch.value - T_BASE;
	if (0 < t_index && t_index < T_COUNT)
	{
	last.value += t_index;
	// pop LV
	buf.pop_back ();
	buf.push_back (last);
	continue;
	}
	}
	last = ch;
	buf.push_back (last);
	}
	return buf;
	}

	string_t
	recomp (string_t s)
	{
	// compose hangul first
	s = compose_hangul (s);

	string_t buf;
	if (s.size () < 2)
	return s;

	int last_class = -1;
	// int starter_pos = 0; // Assume the first character is Starter. Correct?
	// int target_pos = 1;
	codepoint_t starter_ch = s[0];

	for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
	{
	// get current character
	codepoint_t ch = s[src_pos];

	int ch_class = lookup_cc (ch);
	tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
	if (composite.has_value () && last_class < ch_class)
	{
	// ch can be composed
	buf.push_back (composite.value ());
	starter_ch = composite.value ();
	}
	else if (ch_class == 0)
	{
	// ch is Starter and cannot be composed.
	if (src_pos == 1)
	// FIXME: buggy?
	buf.push_back (starter_ch);
	starter_ch = ch;
	last_class = -1;
	buf.push_back (ch);
	}
	else
	{
	if (src_pos == 1)
	// FIXME: buggy?
	buf.push_back (starter_ch);
	// ch is not Starter.
	last_class = ch_class;
	buf.push_back (ch);
	}
	}
	return buf;
	}

	// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
	QuickCheckResult
	nfc_quick_check (const string_t &s)
	{
	int last_canonical_class = 0;
	QuickCheckResult res = QuickCheckResult::YES;

	for (unsigned long i = 0; i < s.size (); i++)
	{
	codepoint_t c = s[i];

	if (c.is_supplementary_character ())
	i++;

	int canonical_class = lookup_cc (c);
	if (last_canonical_class > canonical_class && canonical_class != 0)
	return QuickCheckResult::NO;

	if (is_nfc_qc_no (c.value))
	return QuickCheckResult::NO;

	if (is_nfc_qc_maybe (c.value))
	res = QuickCheckResult::MAYBE;

	last_canonical_class = canonical_class;
	}
	return res;
	}

	string_t
	nfc_normalize (const string_t &s)
	{
	if (nfc_quick_check (s) == QuickCheckResult::YES)
	return s;

	// TODO: optimize normalization.
	// i.e. only normalize a limited area around MAYBE character, instead of
	// performing complete normlization of the entire string

	// decompose
	string_t d = decomp_cano (s);
	sort_cano (d);

	// recompose
	string_t r = recomp (d);
	return r;
	}

	Utf8String
	Utf8String::nfc_normalize () const
	{
	return Utf8String (Rust::nfc_normalize (chars));
	}

	bool
	is_alphabetic (uint32_t codepoint)
	{
	return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
	}

	bool
	is_numeric (uint32_t codepoint)
	{
	return std::binary_search (NUMERIC_CODEPOINTS.begin (),
	NUMERIC_CODEPOINTS.end (), codepoint);
	}

	bool
	is_nfc_qc_maybe (uint32_t codepoint)
	{
	return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
	}

	bool
	is_nfc_qc_no (uint32_t codepoint)
	{
	return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
	}

	bool
	is_ascii_only (const std::string &str)
	{
	for (char c : str)
	if (static_cast<uint32_t> (c) > MAX_ASCII_CODEPOINT)
	return false;
	return true;
	}

	} // namespace Rust

	#if CHECKING_P

	namespace selftest {

	void
	rust_nfc_qc_test ()
	{
	ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
	Rust::QuickCheckResult::YES);
	ASSERT_EQ (Rust::nfc_quick_check (
	{0x1e0a /* NFC_QC_YES /, 0x0323 / NFC_QC_MAYBE */}),
	Rust::QuickCheckResult::MAYBE);
	ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
	Rust::QuickCheckResult::NO);
	}

	void
	assert_normalize (const std::vector<Rust::Codepoint> origin,
	const std::vector<Rust::Codepoint> expected)
	{
	std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);

	ASSERT_EQ (actual.size (), expected.size ());
	for (unsigned int i = 0; i < actual.size (); i++)
	{
	ASSERT_EQ (actual[i], expected[i]);
	}
	}

	void
	rust_utf8_normalize_test ()
	{
	// ASCII
	assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
	// ASCII
	assert_normalize ({'/', '\\', '.', ':', ''}, {'/', '\\', '.', ':', ''});

	// testcases retrieved from Part0 of
	// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
	assert_normalize ({0x1e0a}, {0x1e0a});
	assert_normalize ({0x1e0c}, {0x1e0c});
	assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
	assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
	assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

	// testcases for Hangul from Part0
	assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
	assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
	// testcases for Hangul from Part1
	assert_normalize ({0x3131}, {0x3131});
	assert_normalize ({0x3132}, {0x3132});
	// testcases for Hangul from Part3
	assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
	assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});

	// TODO: add more testcases in
	// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
	}

	void
	rust_utf8_property_test ()
	{
	ASSERT_TRUE (Rust::is_alphabetic ('A'));
	ASSERT_TRUE (Rust::is_alphabetic ('B'));
	ASSERT_TRUE (Rust::is_alphabetic ('x'));
	ASSERT_TRUE (Rust::is_alphabetic ('z'));
	ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
	ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん
	ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ
	ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃

	ASSERT_FALSE (Rust::is_alphabetic ('\v'));
	ASSERT_FALSE (Rust::is_alphabetic ('-'));
	ASSERT_FALSE (Rust::is_alphabetic ('_'));
	ASSERT_FALSE (Rust::is_alphabetic ('+'));
	ASSERT_FALSE (Rust::is_alphabetic ('0'));
	ASSERT_FALSE (Rust::is_alphabetic ('1'));
	ASSERT_FALSE (Rust::is_alphabetic ('2'));
	ASSERT_FALSE (Rust::is_alphabetic ('9'));
	ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
	ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁

	// `Nd`s
	ASSERT_TRUE (Rust::is_numeric ('0'));
	ASSERT_TRUE (Rust::is_numeric ('1'));
	ASSERT_TRUE (Rust::is_numeric ('7'));
	ASSERT_TRUE (Rust::is_numeric ('9'));
	ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
	ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
	// `Nl`s
	ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ
	ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ
	ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
	ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
	// `No`s
	ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
	ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱

	ASSERT_FALSE (Rust::is_numeric ('\n'));
	ASSERT_FALSE (Rust::is_numeric ('-'));
	ASSERT_FALSE (Rust::is_numeric ('_'));
	ASSERT_FALSE (Rust::is_numeric ('('));
	ASSERT_FALSE (Rust::is_numeric ('z'));
	ASSERT_FALSE (Rust::is_numeric (';'));
	ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
	ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
	ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
	ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
	ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
	}

	} // namespace selftest

	#endif // CHECKING_P