| ------------------------------------------------------------------------------ |
| -- -- |
| -- GNAT RUN-TIME COMPONENTS -- |
| -- -- |
| -- G N A T . D E C O D E _ S T R I N G -- |
| -- -- |
| -- S p e c -- |
| -- -- |
| -- Copyright (C) 2007-2022, AdaCore -- |
| -- -- |
| -- GNAT is free software; you can redistribute it and/or modify it under -- |
| -- terms of the GNU General Public License as published by the Free Soft- -- |
| -- ware Foundation; either version 3, or (at your option) any later ver- -- |
| -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- |
| -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- |
| -- or FITNESS FOR A PARTICULAR PURPOSE. -- |
| -- -- |
| -- As a special exception under Section 7 of GPL version 3, you are granted -- |
| -- additional permissions described in the GCC Runtime Library Exception, -- |
| -- version 3.1, as published by the Free Software Foundation. -- |
| -- -- |
| -- You should have received a copy of the GNU General Public License and -- |
| -- a copy of the GCC Runtime Library Exception along with this program; -- |
| -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- |
| -- <http://www.gnu.org/licenses/>. -- |
| -- -- |
| -- GNAT was originally developed by the GNAT team at New York University. -- |
| -- Extensive contributions were provided by Ada Core Technologies Inc. -- |
| -- -- |
| ------------------------------------------------------------------------------ |
| |
| -- This package provides a utility routine for converting from an encoded |
| -- string to a corresponding Wide_String or Wide_Wide_String value. |
| |
| with Interfaces; use Interfaces; |
| |
| with System.WCh_Cnv; use System.WCh_Cnv; |
| with System.WCh_Con; use System.WCh_Con; |
| |
| package body GNAT.Decode_String is |
| |
| ----------------------- |
| -- Local Subprograms -- |
| ----------------------- |
| |
| procedure Bad; |
| pragma No_Return (Bad); |
| -- Raise error for bad encoding |
| |
| procedure Past_End; |
| pragma No_Return (Past_End); |
| -- Raise error for off end of string |
| |
| --------- |
| -- Bad -- |
| --------- |
| |
| procedure Bad is |
| begin |
| raise Constraint_Error with |
| "bad encoding or character out of range"; |
| end Bad; |
| |
| --------------------------- |
| -- Decode_Wide_Character -- |
| --------------------------- |
| |
| procedure Decode_Wide_Character |
| (Input : String; |
| Ptr : in out Natural; |
| Result : out Wide_Character) |
| is |
| Char : Wide_Wide_Character; |
| begin |
| Decode_Wide_Wide_Character (Input, Ptr, Char); |
| |
| if Wide_Wide_Character'Pos (Char) > 16#FFFF# then |
| Bad; |
| else |
| Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char)); |
| end if; |
| end Decode_Wide_Character; |
| |
| ------------------------ |
| -- Decode_Wide_String -- |
| ------------------------ |
| |
| function Decode_Wide_String (S : String) return Wide_String is |
| Result : Wide_String (1 .. S'Length); |
| Length : Natural; |
| begin |
| Decode_Wide_String (S, Result, Length); |
| return Result (1 .. Length); |
| end Decode_Wide_String; |
| |
| procedure Decode_Wide_String |
| (S : String; |
| Result : out Wide_String; |
| Length : out Natural) |
| is |
| Ptr : Natural; |
| |
| begin |
| Ptr := S'First; |
| Length := 0; |
| while Ptr <= S'Last loop |
| if Length >= Result'Last then |
| Past_End; |
| end if; |
| |
| Length := Length + 1; |
| Decode_Wide_Character (S, Ptr, Result (Length)); |
| end loop; |
| end Decode_Wide_String; |
| |
| -------------------------------- |
| -- Decode_Wide_Wide_Character -- |
| -------------------------------- |
| |
| procedure Decode_Wide_Wide_Character |
| (Input : String; |
| Ptr : in out Natural; |
| Result : out Wide_Wide_Character) |
| is |
| C : Character; |
| |
| function In_Char return Character; |
| pragma Inline (In_Char); |
| -- Function to get one input character |
| |
| ------------- |
| -- In_Char -- |
| ------------- |
| |
| function In_Char return Character is |
| begin |
| if Ptr <= Input'Last then |
| Ptr := Ptr + 1; |
| return Input (Ptr - 1); |
| else |
| Past_End; |
| end if; |
| end In_Char; |
| |
| -- Start of processing for Decode_Wide_Wide_Character |
| |
| begin |
| C := In_Char; |
| |
| -- Special fast processing for UTF-8 case |
| |
| if Encoding_Method = WCEM_UTF8 then |
| UTF8 : declare |
| U : Unsigned_32; |
| W : Unsigned_32; |
| |
| procedure Get_UTF_Byte; |
| pragma Inline (Get_UTF_Byte); |
| -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode. |
| -- Reads a byte, and raises CE if the first two bits are not 10. |
| -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. |
| |
| ------------------ |
| -- Get_UTF_Byte -- |
| ------------------ |
| |
| procedure Get_UTF_Byte is |
| begin |
| U := Unsigned_32 (Character'Pos (In_Char)); |
| |
| if (U and 2#11000000#) /= 2#10_000000# then |
| Bad; |
| end if; |
| |
| W := Shift_Left (W, 6) or (U and 2#00111111#); |
| end Get_UTF_Byte; |
| |
| -- Start of processing for UTF8 case |
| |
| begin |
| -- Note: for details of UTF8 encoding see RFC 3629 |
| |
| U := Unsigned_32 (Character'Pos (C)); |
| |
| -- 16#00_0000#-16#00_007F#: 0xxxxxxx |
| |
| if (U and 2#10000000#) = 2#00000000# then |
| Result := Wide_Wide_Character'Val (Character'Pos (C)); |
| |
| -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx |
| |
| elsif (U and 2#11100000#) = 2#110_00000# then |
| W := U and 2#00011111#; |
| Get_UTF_Byte; |
| |
| if W not in 16#00_0080# .. 16#00_07FF# then |
| Bad; |
| end if; |
| |
| Result := Wide_Wide_Character'Val (W); |
| |
| -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx |
| |
| elsif (U and 2#11110000#) = 2#1110_0000# then |
| W := U and 2#00001111#; |
| Get_UTF_Byte; |
| Get_UTF_Byte; |
| |
| if W not in 16#00_0800# .. 16#00_FFFF# then |
| Bad; |
| end if; |
| |
| Result := Wide_Wide_Character'Val (W); |
| |
| -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| |
| elsif (U and 2#11111000#) = 2#11110_000# then |
| W := U and 2#00000111#; |
| |
| for K in 1 .. 3 loop |
| Get_UTF_Byte; |
| end loop; |
| |
| if W not in 16#01_0000# .. 16#10_FFFF# then |
| Bad; |
| end if; |
| |
| Result := Wide_Wide_Character'Val (W); |
| |
| -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| |
| elsif (U and 2#11111100#) = 2#111110_00# then |
| W := U and 2#00000011#; |
| |
| for K in 1 .. 4 loop |
| Get_UTF_Byte; |
| end loop; |
| |
| if W not in 16#0020_0000# .. 16#03FF_FFFF# then |
| Bad; |
| end if; |
| |
| Result := Wide_Wide_Character'Val (W); |
| |
| -- All other cases are invalid, note that this includes: |
| |
| -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx 10xxxxxx |
| |
| -- since Wide_Wide_Character does not include code values |
| -- greater than 16#03FF_FFFF#. |
| |
| else |
| Bad; |
| end if; |
| end UTF8; |
| |
| -- All encoding functions other than UTF-8 |
| |
| else |
| Non_UTF8 : declare |
| function Char_Sequence_To_UTF is |
| new Char_Sequence_To_UTF_32 (In_Char); |
| |
| begin |
| -- For brackets, must test for specific case of [ not followed by |
| -- quotation, where we must not call Char_Sequence_To_UTF, but |
| -- instead just return the bracket unchanged. |
| |
| if Encoding_Method = WCEM_Brackets |
| and then C = '[' |
| and then (Ptr > Input'Last or else Input (Ptr) /= '"') |
| then |
| Result := '['; |
| |
| -- All other cases including [" with Brackets |
| |
| else |
| Result := |
| Wide_Wide_Character'Val |
| (Char_Sequence_To_UTF (C, Encoding_Method)); |
| end if; |
| end Non_UTF8; |
| end if; |
| end Decode_Wide_Wide_Character; |
| |
| ----------------------------- |
| -- Decode_Wide_Wide_String -- |
| ----------------------------- |
| |
| function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is |
| Result : Wide_Wide_String (1 .. S'Length); |
| Length : Natural; |
| begin |
| Decode_Wide_Wide_String (S, Result, Length); |
| return Result (1 .. Length); |
| end Decode_Wide_Wide_String; |
| |
| procedure Decode_Wide_Wide_String |
| (S : String; |
| Result : out Wide_Wide_String; |
| Length : out Natural) |
| is |
| Ptr : Natural; |
| |
| begin |
| Ptr := S'First; |
| Length := 0; |
| while Ptr <= S'Last loop |
| if Length >= Result'Last then |
| Past_End; |
| end if; |
| |
| Length := Length + 1; |
| Decode_Wide_Wide_Character (S, Ptr, Result (Length)); |
| end loop; |
| end Decode_Wide_Wide_String; |
| |
| ------------------------- |
| -- Next_Wide_Character -- |
| ------------------------- |
| |
| procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is |
| Discard : Wide_Character; |
| begin |
| Decode_Wide_Character (Input, Ptr, Discard); |
| end Next_Wide_Character; |
| |
| ------------------------------ |
| -- Next_Wide_Wide_Character -- |
| ------------------------------ |
| |
| procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is |
| Discard : Wide_Wide_Character; |
| begin |
| Decode_Wide_Wide_Character (Input, Ptr, Discard); |
| end Next_Wide_Wide_Character; |
| |
| -------------- |
| -- Past_End -- |
| -------------- |
| |
| procedure Past_End is |
| begin |
| raise Constraint_Error with "past end of string"; |
| end Past_End; |
| |
| ------------------------- |
| -- Prev_Wide_Character -- |
| ------------------------- |
| |
| procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is |
| begin |
| if Ptr > Input'Last + 1 then |
| Past_End; |
| end if; |
| |
| -- Special efficient encoding for UTF-8 case |
| |
| if Encoding_Method = WCEM_UTF8 then |
| UTF8 : declare |
| U : Unsigned_32; |
| |
| procedure Getc; |
| pragma Inline (Getc); |
| -- Gets the character at Input (Ptr - 1) and returns code in U as |
| -- Unsigned_32 value. On return Ptr is decremented by one. |
| |
| procedure Skip_UTF_Byte; |
| pragma Inline (Skip_UTF_Byte); |
| -- Checks that U is 2#10xxxxxx# and then calls Get |
| |
| ---------- |
| -- Getc -- |
| ---------- |
| |
| procedure Getc is |
| begin |
| if Ptr <= Input'First then |
| Past_End; |
| else |
| Ptr := Ptr - 1; |
| U := Unsigned_32 (Character'Pos (Input (Ptr))); |
| end if; |
| end Getc; |
| |
| ------------------- |
| -- Skip_UTF_Byte -- |
| ------------------- |
| |
| procedure Skip_UTF_Byte is |
| begin |
| if (U and 2#11000000#) = 2#10_000000# then |
| Getc; |
| else |
| Bad; |
| end if; |
| end Skip_UTF_Byte; |
| |
| -- Start of processing for UTF-8 case |
| |
| begin |
| -- 16#00_0000#-16#00_007F#: 0xxxxxxx |
| |
| Getc; |
| |
| if (U and 2#10000000#) = 2#00000000# then |
| return; |
| |
| -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11100000#) = 2#110_00000# then |
| return; |
| |
| -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11110000#) = 2#1110_0000# then |
| return; |
| |
| -- Any other code is invalid, note that this includes: |
| |
| -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx |
| -- 10xxxxxx |
| |
| -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| -- 10xxxxxx |
| |
| -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| |
| -- since Wide_Character does not allow codes > 16#FFFF# |
| |
| else |
| Bad; |
| end if; |
| end if; |
| end if; |
| end UTF8; |
| |
| -- Special efficient encoding for brackets case |
| |
| elsif Encoding_Method = WCEM_Brackets then |
| Brackets : declare |
| P : Natural; |
| S : Natural; |
| |
| begin |
| -- See if we have "] at end positions |
| |
| if Ptr > Input'First + 1 |
| and then Input (Ptr - 1) = ']' |
| and then Input (Ptr - 2) = '"' |
| then |
| P := Ptr - 2; |
| |
| -- Loop back looking for [" at start |
| |
| while P >= Ptr - 10 loop |
| if P <= Input'First + 1 then |
| Bad; |
| |
| elsif Input (P - 1) = '"' |
| and then Input (P - 2) = '[' |
| then |
| -- Found ["..."], scan forward to check it |
| |
| S := P - 2; |
| P := S; |
| Next_Wide_Character (Input, P); |
| |
| -- OK if at original pointer, else error |
| |
| if P = Ptr then |
| Ptr := S; |
| return; |
| else |
| Bad; |
| end if; |
| end if; |
| |
| P := P - 1; |
| end loop; |
| |
| -- Falling through loop means more than 8 chars between the |
| -- enclosing brackets (or simply a missing left bracket) |
| |
| Bad; |
| |
| -- Here if no bracket sequence present |
| |
| else |
| if Ptr = Input'First then |
| Past_End; |
| else |
| Ptr := Ptr - 1; |
| end if; |
| end if; |
| end Brackets; |
| |
| -- Non-UTF-8/Brackets. These are the inefficient cases where we have to |
| -- go to the start of the string and skip forwards till Ptr matches. |
| |
| else |
| Non_UTF_Brackets : declare |
| Discard : Wide_Character; |
| PtrS : Natural; |
| PtrP : Natural; |
| |
| begin |
| PtrS := Input'First; |
| |
| if Ptr <= PtrS then |
| Past_End; |
| end if; |
| |
| loop |
| PtrP := PtrS; |
| Decode_Wide_Character (Input, PtrS, Discard); |
| |
| if PtrS = Ptr then |
| Ptr := PtrP; |
| return; |
| |
| elsif PtrS > Ptr then |
| Bad; |
| end if; |
| end loop; |
| |
| exception |
| when Constraint_Error => |
| Bad; |
| end Non_UTF_Brackets; |
| end if; |
| end Prev_Wide_Character; |
| |
| ------------------------------ |
| -- Prev_Wide_Wide_Character -- |
| ------------------------------ |
| |
| procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is |
| begin |
| if Ptr > Input'Last + 1 then |
| Past_End; |
| end if; |
| |
| -- Special efficient encoding for UTF-8 case |
| |
| if Encoding_Method = WCEM_UTF8 then |
| UTF8 : declare |
| U : Unsigned_32; |
| |
| procedure Getc; |
| pragma Inline (Getc); |
| -- Gets the character at Input (Ptr - 1) and returns code in U as |
| -- Unsigned_32 value. On return Ptr is decremented by one. |
| |
| procedure Skip_UTF_Byte; |
| pragma Inline (Skip_UTF_Byte); |
| -- Checks that U is 2#10xxxxxx# and then calls Get |
| |
| ---------- |
| -- Getc -- |
| ---------- |
| |
| procedure Getc is |
| begin |
| if Ptr <= Input'First then |
| Past_End; |
| else |
| Ptr := Ptr - 1; |
| U := Unsigned_32 (Character'Pos (Input (Ptr))); |
| end if; |
| end Getc; |
| |
| ------------------- |
| -- Skip_UTF_Byte -- |
| ------------------- |
| |
| procedure Skip_UTF_Byte is |
| begin |
| if (U and 2#11000000#) = 2#10_000000# then |
| Getc; |
| else |
| Bad; |
| end if; |
| end Skip_UTF_Byte; |
| |
| -- Start of processing for UTF-8 case |
| |
| begin |
| -- 16#00_0000#-16#00_007F#: 0xxxxxxx |
| |
| Getc; |
| |
| if (U and 2#10000000#) = 2#00000000# then |
| return; |
| |
| -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11100000#) = 2#110_00000# then |
| return; |
| |
| -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11110000#) = 2#1110_0000# then |
| return; |
| |
| -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx |
| -- 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11111000#) = 2#11110_000# then |
| return; |
| |
| -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| -- 10xxxxxx |
| |
| else |
| Skip_UTF_Byte; |
| |
| if (U and 2#11111100#) = 2#111110_00# then |
| return; |
| |
| -- Any other code is invalid, note that this includes: |
| |
| -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| -- 10xxxxxx 10xxxxxx |
| |
| -- since Wide_Wide_Character does not allow codes |
| -- greater than 16#03FF_FFFF# |
| |
| else |
| Bad; |
| end if; |
| end if; |
| end if; |
| end if; |
| end if; |
| end UTF8; |
| |
| -- Special efficient encoding for brackets case |
| |
| elsif Encoding_Method = WCEM_Brackets then |
| Brackets : declare |
| P : Natural; |
| S : Natural; |
| |
| begin |
| -- See if we have "] at end positions |
| |
| if Ptr > Input'First + 1 |
| and then Input (Ptr - 1) = ']' |
| and then Input (Ptr - 2) = '"' |
| then |
| P := Ptr - 2; |
| |
| -- Loop back looking for [" at start |
| |
| while P >= Ptr - 10 loop |
| if P <= Input'First + 1 then |
| Bad; |
| |
| elsif Input (P - 1) = '"' |
| and then Input (P - 2) = '[' |
| then |
| -- Found ["..."], scan forward to check it |
| |
| S := P - 2; |
| P := S; |
| Next_Wide_Wide_Character (Input, P); |
| |
| -- OK if at original pointer, else error |
| |
| if P = Ptr then |
| Ptr := S; |
| return; |
| else |
| Bad; |
| end if; |
| end if; |
| |
| P := P - 1; |
| end loop; |
| |
| -- Falling through loop means more than 8 chars between the |
| -- enclosing brackets (or simply a missing left bracket) |
| |
| Bad; |
| |
| -- Here if no bracket sequence present |
| |
| else |
| if Ptr = Input'First then |
| Past_End; |
| else |
| Ptr := Ptr - 1; |
| end if; |
| end if; |
| end Brackets; |
| |
| -- Non-UTF-8/Brackets. These are the inefficient cases where we have to |
| -- go to the start of the string and skip forwards till Ptr matches. |
| |
| else |
| Non_UTF8_Brackets : declare |
| Discard : Wide_Wide_Character; |
| PtrS : Natural; |
| PtrP : Natural; |
| |
| begin |
| PtrS := Input'First; |
| |
| if Ptr <= PtrS then |
| Past_End; |
| end if; |
| |
| loop |
| PtrP := PtrS; |
| Decode_Wide_Wide_Character (Input, PtrS, Discard); |
| |
| if PtrS = Ptr then |
| Ptr := PtrP; |
| return; |
| |
| elsif PtrS > Ptr then |
| Bad; |
| end if; |
| end loop; |
| |
| exception |
| when Constraint_Error => |
| Bad; |
| end Non_UTF8_Brackets; |
| end if; |
| end Prev_Wide_Wide_Character; |
| |
| -------------------------- |
| -- Validate_Wide_String -- |
| -------------------------- |
| |
| function Validate_Wide_String (S : String) return Boolean is |
| Ptr : Natural; |
| |
| begin |
| Ptr := S'First; |
| while Ptr <= S'Last loop |
| Next_Wide_Character (S, Ptr); |
| end loop; |
| |
| return True; |
| |
| exception |
| when Constraint_Error => |
| return False; |
| end Validate_Wide_String; |
| |
| ------------------------------- |
| -- Validate_Wide_Wide_String -- |
| ------------------------------- |
| |
| function Validate_Wide_Wide_String (S : String) return Boolean is |
| Ptr : Natural; |
| |
| begin |
| Ptr := S'First; |
| while Ptr <= S'Last loop |
| Next_Wide_Wide_Character (S, Ptr); |
| end loop; |
| |
| return True; |
| |
| exception |
| when Constraint_Error => |
| return False; |
| end Validate_Wide_Wide_String; |
| |
| end GNAT.Decode_String; |