| ------------------------------------------------------------------------------ |
| -- -- |
| -- GNAT RUN-TIME COMPONENTS -- |
| -- -- |
| -- ADA.STRINGS.UTF_ENCODING.WIDE_STRINGS -- |
| -- -- |
| -- B o d y -- |
| -- -- |
| -- Copyright (C) 2010-2022, Free Software Foundation, Inc. -- |
| -- -- |
| -- GNAT is free software; you can redistribute it and/or modify it under -- |
| -- terms of the GNU General Public License as published by the Free Soft- -- |
| -- ware Foundation; either version 3, or (at your option) any later ver- -- |
| -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- |
| -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- |
| -- or FITNESS FOR A PARTICULAR PURPOSE. -- |
| -- -- |
| -- As a special exception under Section 7 of GPL version 3, you are granted -- |
| -- additional permissions described in the GCC Runtime Library Exception, -- |
| -- version 3.1, as published by the Free Software Foundation. -- |
| -- -- |
| -- You should have received a copy of the GNU General Public License and -- |
| -- a copy of the GCC Runtime Library Exception along with this program; -- |
| -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- |
| -- <http://www.gnu.org/licenses/>. -- |
| -- -- |
| -- GNAT was originally developed by the GNAT team at New York University. -- |
| -- Extensive contributions were provided by Ada Core Technologies Inc. -- |
| -- -- |
| ------------------------------------------------------------------------------ |
| |
| package body Ada.Strings.UTF_Encoding.Wide_Strings is |
| use Interfaces; |
| |
| ------------ |
| -- Decode -- |
| ------------ |
| |
| -- Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String |
| |
| function Decode |
| (Item : UTF_String; |
| Input_Scheme : Encoding_Scheme) return Wide_String |
| is |
| begin |
| if Input_Scheme = UTF_8 then |
| return Decode (Item); |
| else |
| return Decode (To_UTF_16 (Item, Input_Scheme)); |
| end if; |
| end Decode; |
| |
| -- Decode UTF-8 input to Wide_String |
| |
| function Decode (Item : UTF_8_String) return Wide_String is |
| Result : Wide_String (1 .. Item'Length); |
| -- Result string (worst case is same length as input) |
| |
| Len : Natural := 0; |
| -- Length of result stored so far |
| |
| Iptr : Natural; |
| -- Input Item pointer |
| |
| C : Unsigned_8; |
| R : Unsigned_16; |
| |
| procedure Get_Continuation; |
| -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6 |
| -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr |
| -- is incremented. Raises exception if continuation byte does not exist |
| -- or is invalid. |
| |
| ---------------------- |
| -- Get_Continuation -- |
| ---------------------- |
| |
| procedure Get_Continuation is |
| begin |
| if Iptr > Item'Last then |
| Raise_Encoding_Error (Iptr - 1); |
| |
| else |
| C := To_Unsigned_8 (Item (Iptr)); |
| Iptr := Iptr + 1; |
| |
| if C not in 2#10_000000# .. 2#10_111111# then |
| Raise_Encoding_Error (Iptr - 1); |
| else |
| R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#); |
| end if; |
| end if; |
| end Get_Continuation; |
| |
| -- Start of processing for Decode |
| |
| begin |
| Iptr := Item'First; |
| |
| -- Skip BOM at start |
| |
| if Item'Length >= 3 |
| and then Item (Iptr .. Iptr + 2) = BOM_8 |
| then |
| Iptr := Iptr + 3; |
| |
| -- Error if bad BOM |
| |
| elsif Item'Length >= 2 |
| and then (Item (Iptr .. Iptr + 1) = BOM_16BE |
| or else |
| Item (Iptr .. Iptr + 1) = BOM_16LE) |
| then |
| Raise_Encoding_Error (Iptr); |
| end if; |
| |
| while Iptr <= Item'Last loop |
| C := To_Unsigned_8 (Item (Iptr)); |
| Iptr := Iptr + 1; |
| |
| -- Codes in the range 16#00# - 16#7F# are represented as |
| -- 0xxxxxxx |
| |
| if C <= 16#7F# then |
| R := Unsigned_16 (C); |
| |
| -- No initial code can be of the form 10xxxxxx. Such codes are used |
| -- only for continuations. |
| |
| elsif C <= 2#10_111111# then |
| Raise_Encoding_Error (Iptr - 1); |
| |
| -- Codes in the range 16#80# - 16#7FF# are represented as |
| -- 110yyyxx 10xxxxxx |
| |
| elsif C <= 2#110_11111# then |
| R := Unsigned_16 (C and 2#000_11111#); |
| Get_Continuation; |
| |
| -- Codes in the range 16#800# - 16#FFFF# are represented as |
| -- 1110yyyy 10yyyyxx 10xxxxxx |
| |
| elsif C <= 2#1110_1111# then |
| R := Unsigned_16 (C and 2#0000_1111#); |
| Get_Continuation; |
| Get_Continuation; |
| |
| -- Codes in the range 16#10000# - 16#10FFFF# are represented as |
| -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx |
| |
| -- Such codes are out of range for Wide_String output |
| |
| else |
| Raise_Encoding_Error (Iptr - 1); |
| end if; |
| |
| Len := Len + 1; |
| Result (Len) := Wide_Character'Val (R); |
| end loop; |
| |
| return Result (1 .. Len); |
| end Decode; |
| |
| -- Decode UTF-16 input to Wide_String |
| |
| function Decode (Item : UTF_16_Wide_String) return Wide_String is |
| Result : Wide_String (1 .. Item'Length); |
| -- Result is same length as input (possibly minus 1 if BOM present) |
| |
| Len : Natural := 0; |
| -- Length of result |
| |
| Iptr : Natural; |
| -- Index of next Item element |
| |
| C : Unsigned_16; |
| |
| begin |
| -- Skip UTF-16 BOM at start |
| |
| Iptr := Item'First; |
| |
| if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then |
| Iptr := Iptr + 1; |
| end if; |
| |
| -- Loop through input characters |
| |
| while Iptr <= Item'Last loop |
| C := To_Unsigned_16 (Item (Iptr)); |
| Iptr := Iptr + 1; |
| |
| -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# |
| -- represent their own value. |
| |
| if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then |
| Len := Len + 1; |
| Result (Len) := Wide_Character'Val (C); |
| |
| -- Codes in the range 16#D800#..16#DBFF# represent the first of the |
| -- two surrogates used to encode the range 16#01_000#..16#10_FFFF". |
| -- Such codes are out of range for 16-bit output. |
| |
| -- The case of input in the range 16#DC00#..16#DFFF# must never |
| -- occur, since it means we have a second surrogate character with |
| -- no corresponding first surrogate. |
| |
| -- Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since |
| -- they conflict with codes used for BOM values. |
| |
| -- Thus all remaining codes are invalid |
| |
| else |
| Raise_Encoding_Error (Iptr - 1); |
| end if; |
| end loop; |
| |
| return Result (1 .. Len); |
| end Decode; |
| |
| ------------ |
| -- Encode -- |
| ------------ |
| |
| -- Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE |
| |
| function Encode |
| (Item : Wide_String; |
| Output_Scheme : Encoding_Scheme; |
| Output_BOM : Boolean := False) return UTF_String |
| is |
| begin |
| -- Case of UTF_8 |
| |
| if Output_Scheme = UTF_8 then |
| return Encode (Item, Output_BOM); |
| |
| -- Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary |
| |
| else |
| return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)), |
| Output_Scheme, Output_BOM); |
| end if; |
| end Encode; |
| |
| -- Encode Wide_String in UTF-8 |
| |
| function Encode |
| (Item : Wide_String; |
| Output_BOM : Boolean := False) return UTF_8_String |
| is |
| Result : UTF_8_String (1 .. 3 * Item'Length + 3); |
| -- Worst case is three bytes per input byte + space for BOM |
| |
| Len : Natural; |
| -- Number of output codes stored in Result |
| |
| C : Unsigned_16; |
| -- Single input character |
| |
| procedure Store (C : Unsigned_16); |
| pragma Inline (Store); |
| -- Store one output code, C is in the range 0 .. 255 |
| |
| ----------- |
| -- Store -- |
| ----------- |
| |
| procedure Store (C : Unsigned_16) is |
| begin |
| Len := Len + 1; |
| Result (Len) := Character'Val (C); |
| end Store; |
| |
| -- Start of processing for UTF8_Encode |
| |
| begin |
| -- Output BOM if required |
| |
| if Output_BOM then |
| Result (1 .. 3) := BOM_8; |
| Len := 3; |
| else |
| Len := 0; |
| end if; |
| |
| -- Loop through characters of input |
| |
| for J in Item'Range loop |
| C := To_Unsigned_16 (Item (J)); |
| |
| -- Codes in the range 16#00# - 16#7F# are represented as |
| -- 0xxxxxxx |
| |
| if C <= 16#7F# then |
| Store (C); |
| |
| -- Codes in the range 16#80# - 16#7FF# are represented as |
| -- 110yyyxx 10xxxxxx |
| |
| elsif C <= 16#7FF# then |
| Store (2#110_00000# or Shift_Right (C, 6)); |
| Store (2#10_000000# or (C and 2#00_111111#)); |
| |
| -- Codes in the range 16#800# - 16#FFFF# are represented as |
| -- 1110yyyy 10yyyyxx 10xxxxxx |
| |
| else |
| Store (2#1110_0000# or Shift_Right (C, 12)); |
| Store (2#10_000000# or |
| Shift_Right (C and 2#111111_000000#, 6)); |
| Store (2#10_000000# or (C and 2#00_111111#)); |
| end if; |
| end loop; |
| |
| return Result (1 .. Len); |
| end Encode; |
| |
| -- Encode Wide_String in UTF-16 |
| |
| function Encode |
| (Item : Wide_String; |
| Output_BOM : Boolean := False) return UTF_16_Wide_String |
| is |
| Result : UTF_16_Wide_String |
| (1 .. Item'Length + Boolean'Pos (Output_BOM)); |
| -- Output is same length as input + possible BOM |
| |
| Len : Integer; |
| -- Length of output string |
| |
| C : Unsigned_16; |
| |
| begin |
| -- Output BOM if required |
| |
| if Output_BOM then |
| Result (1) := BOM_16 (1); |
| Len := 1; |
| else |
| Len := 0; |
| end if; |
| |
| -- Loop through input characters encoding them |
| |
| for Iptr in Item'Range loop |
| C := To_Unsigned_16 (Item (Iptr)); |
| |
| -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are |
| -- output unchanged. |
| |
| if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then |
| Len := Len + 1; |
| Result (Len) := Wide_Character'Val (C); |
| |
| -- Codes in the range 16#D800#..16#DFFF# should never appear in the |
| -- input, since no valid Unicode characters are in this range (which |
| -- would conflict with the UTF-16 surrogate encodings). Similarly |
| -- codes in the range 16#FFFE#..16#FFFF conflict with BOM codes. |
| -- Thus all remaining codes are illegal. |
| |
| else |
| Raise_Encoding_Error (Iptr); |
| end if; |
| end loop; |
| |
| return Result; |
| end Encode; |
| |
| end Ada.Strings.UTF_Encoding.Wide_Strings; |