blob: 89824095ec188da644a4e11d02e36f1274a1b74a [file] [log] [blame]
------------------------------------------------------------------------------
-- --
-- GNAT RUN-TIME COMPONENTS --
-- --
-- G N A T . D E C O D E _ S T R I N G --
-- --
-- B o d y --
-- --
-- Copyright (C) 2007-2021, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 3, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception, --
-- version 3.1, as published by the Free Software Foundation. --
-- --
-- You should have received a copy of the GNU General Public License and --
-- a copy of the GCC Runtime Library Exception along with this program; --
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
-- <http://www.gnu.org/licenses/>. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
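-- This package provides utility routines for decoding an encoded string
-- into the corresponding Wide_String or Wide_Wide_String value, for moving
-- forwards and backwards through an encoded string one character at a
-- time, and for validating an encoded string.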
with Interfaces; use Interfaces;
with System.WCh_Cnv; use System.WCh_Cnv;
with System.WCh_Con; use System.WCh_Con;
package body GNAT.Decode_String is
-----------------------
-- Local Subprograms --
-----------------------
procedure Bad;
pragma No_Return (Bad);
-- Raises Constraint_Error with the message "bad encoding or character out
-- of range". Declared No_Return, so callers may rely on control never
-- coming back from a call.
procedure Past_End;
pragma No_Return (Past_End);
-- Raises Constraint_Error with the message "past end of string". Declared
-- No_Return, so callers may rely on control never coming back from a call.
---------
-- Bad --
---------
procedure Bad is
   --  Common failure exit used when an encoding sequence is malformed or
   --  when a decoded code does not fit the requested character type. Always
   --  raises Constraint_Error; never returns.
begin
   raise Constraint_Error with "bad encoding or character out of range";
end Bad;
---------------------------
-- Decode_Wide_Character --
---------------------------
procedure Decode_Wide_Character
  (Input  : String;
   Ptr    : in out Natural;
   Result : out Wide_Character)
is
   --  Decodes one character of Input starting at Ptr by delegating to
   --  Decode_Wide_Wide_Character, which also advances Ptr, and then narrows
   --  the value to Wide_Character. A decoded code above 16#FFFF# cannot be
   --  represented and is rejected through Bad (Constraint_Error).

   Decoded : Wide_Wide_Character;

begin
   Decode_Wide_Wide_Character (Input, Ptr, Decoded);

   if Wide_Wide_Character'Pos (Decoded) <= 16#FFFF# then
      Result := Wide_Character'Val (Wide_Wide_Character'Pos (Decoded));
   else
      Bad;
   end if;
end Decode_Wide_Character;
------------------------
-- Decode_Wide_String --
------------------------
function Decode_Wide_String (S : String) return Wide_String is
   --  Decodes the whole of S and returns the decoded characters as a
   --  Wide_String whose lower bound is 1. The work buffer has S'Length
   --  elements; only the first Count of them (as reported by the procedure
   --  form of Decode_Wide_String) are returned.

   Buffer : Wide_String (1 .. S'Length);
   Count  : Natural;

begin
   Decode_Wide_String (S, Buffer, Count);
   return Buffer (1 .. Count);
end Decode_Wide_String;
procedure Decode_Wide_String
  (S      : String;
   Result : out Wide_String;
   Length : out Natural)
is
   --  Decodes every character of S into Result, filling it from
   --  Result'First upwards.
   --
   --  S      : encoded input string
   --  Result : receives the decoded characters in
   --           Result (Result'First .. Result'First + Length - 1)
   --  Length : number of characters stored in Result
   --
   --  Raises Constraint_Error (through Past_End) when Result has no room for
   --  the next decoded character, and propagates any Constraint_Error raised
   --  by Decode_Wide_Character.

   Ptr : Natural;

begin
   Ptr    := S'First;
   Length := 0;

   while Ptr <= S'Last loop

      --  Compare against the capacity of Result rather than Result'Last, so
      --  that a Result whose lower bound is not 1 is handled correctly.

      if Length >= Result'Length then
         Past_End;
      end if;

      Length := Length + 1;

      --  Length counts stored characters; convert it to an index that is
      --  relative to Result'First.

      Decode_Wide_Character (S, Ptr, Result (Result'First + Length - 1));
   end loop;
end Decode_Wide_String;
--------------------------------
-- Decode_Wide_Wide_Character --
--------------------------------
procedure Decode_Wide_Wide_Character
  (Input  : String;
   Ptr    : in out Natural;
   Result : out Wide_Wide_Character)
is
   --  Decodes the single encoded character of Input that starts at Ptr,
   --  stores its value in Result and leaves Ptr just past the bytes that
   --  were consumed. Running off the end of Input raises Constraint_Error
   --  through Past_End; an invalid sequence raises it through Bad.

   First_Byte : Character;

   function In_Char return Character;
   pragma Inline (In_Char);
   --  Returns Input (Ptr) and advances Ptr by one. Calls Past_End when Ptr
   --  is already beyond Input'Last.

   -------------
   -- In_Char --
   -------------

   function In_Char return Character is
   begin
      if Ptr > Input'Last then
         Past_End;
      end if;

      Ptr := Ptr + 1;
      return Input (Ptr - 1);
   end In_Char;

--  Start of processing for Decode_Wide_Wide_Character

begin
   First_Byte := In_Char;

   --  UTF-8 is decoded directly here

   if Encoding_Method = WCEM_UTF8 then
      UTF8 : declare
         U : Unsigned_32;
         --  Most recently read byte

         W : Unsigned_32;
         --  Code value accumulated so far

         procedure Get_UTF_Byte;
         pragma Inline (Get_UTF_Byte);
         --  Reads one byte into U. If it has the 2#10xxxxxx# continuation
         --  form, W is shifted left six bits and the xxxxxx bits are or'ed
         --  in; otherwise Bad is called.

         procedure Finish_Sequence
           (Payload  : Unsigned_32;
            Trailing : Positive;
            Low      : Unsigned_32;
            High     : Unsigned_32);
         pragma Inline (Finish_Sequence);
         --  Starts W from Payload (the data bits of the leading byte), reads
         --  Trailing continuation bytes, calls Bad unless the resulting code
         --  lies in Low .. High, and otherwise stores the code in Result.

         ---------------------
         -- Finish_Sequence --
         ---------------------

         procedure Finish_Sequence
           (Payload  : Unsigned_32;
            Trailing : Positive;
            Low      : Unsigned_32;
            High     : Unsigned_32)
         is
         begin
            W := Payload;

            for K in 1 .. Trailing loop
               Get_UTF_Byte;
            end loop;

            if W not in Low .. High then
               Bad;
            end if;

            Result := Wide_Wide_Character'Val (W);
         end Finish_Sequence;

         ------------------
         -- Get_UTF_Byte --
         ------------------

         procedure Get_UTF_Byte is
         begin
            U := Unsigned_32 (Character'Pos (In_Char));

            if (U and 2#11000000#) = 2#10_000000# then
               W := Shift_Left (W, 6) or (U and 2#00111111#);
            else
               Bad;
            end if;
         end Get_UTF_Byte;

      --  Start of processing for UTF8 case

      begin
         --  Note: for details of UTF8 encoding see RFC 3629

         U := Unsigned_32 (Character'Pos (First_Byte));

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         if (U and 2#10000000#) = 2#00000000# then
            Result := Wide_Wide_Character'Val (Character'Pos (First_Byte));

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         elsif (U and 2#11100000#) = 2#110_00000# then
            Finish_Sequence
              (U and 2#00011111#, 1, 16#00_0080#, 16#00_07FF#);

         --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

         elsif (U and 2#11110000#) = 2#1110_0000# then
            Finish_Sequence
              (U and 2#00001111#, 2, 16#00_0800#, 16#00_FFFF#);

         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

         elsif (U and 2#11111000#) = 2#11110_000# then
            Finish_Sequence
              (U and 2#00000111#, 3, 16#01_0000#, 16#10_FFFF#);

         --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
         --                               10xxxxxx 10xxxxxx

         elsif (U and 2#11111100#) = 2#111110_00# then
            Finish_Sequence
              (U and 2#00000011#, 4, 16#0020_0000#, 16#03FF_FFFF#);

         --  Every other leading byte is rejected, including the six-byte
         --  form 1111110x (16#0400_0000#-16#7FFF_FFFF#).

         else
            Bad;
         end if;
      end UTF8;

   --  All encoding methods other than UTF-8

   else
      Non_UTF8 : declare
         function Char_Sequence_To_UTF is
           new Char_Sequence_To_UTF_32 (In_Char);

      begin
         --  With the brackets encoding a '[' that is not followed by a
         --  quotation mark stands for itself and must not be handed to
         --  Char_Sequence_To_UTF. Everything else, including [" with
         --  brackets, goes through Char_Sequence_To_UTF.

         if Encoding_Method /= WCEM_Brackets
           or else First_Byte /= '['
           or else (Ptr <= Input'Last and then Input (Ptr) = '"')
         then
            Result :=
              Wide_Wide_Character'Val
                (Char_Sequence_To_UTF (First_Byte, Encoding_Method));
         else
            Result := '[';
         end if;
      end Non_UTF8;
   end if;
end Decode_Wide_Wide_Character;
-----------------------------
-- Decode_Wide_Wide_String --
-----------------------------
function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
   --  Decodes the whole of S and returns the decoded characters as a
   --  Wide_Wide_String whose lower bound is 1. The work buffer has S'Length
   --  elements; only the first Count of them (as reported by the procedure
   --  form of Decode_Wide_Wide_String) are returned.

   Buffer : Wide_Wide_String (1 .. S'Length);
   Count  : Natural;

begin
   Decode_Wide_Wide_String (S, Buffer, Count);
   return Buffer (1 .. Count);
end Decode_Wide_Wide_String;
procedure Decode_Wide_Wide_String
  (S      : String;
   Result : out Wide_Wide_String;
   Length : out Natural)
is
   --  Decodes every character of S into Result, filling it from
   --  Result'First upwards.
   --
   --  S      : encoded input string
   --  Result : receives the decoded characters in
   --           Result (Result'First .. Result'First + Length - 1)
   --  Length : number of characters stored in Result
   --
   --  Raises Constraint_Error (through Past_End) when Result has no room for
   --  the next decoded character, and propagates any Constraint_Error raised
   --  by Decode_Wide_Wide_Character.

   Ptr : Natural;

begin
   Ptr    := S'First;
   Length := 0;

   while Ptr <= S'Last loop

      --  Compare against the capacity of Result rather than Result'Last, so
      --  that a Result whose lower bound is not 1 is handled correctly.

      if Length >= Result'Length then
         Past_End;
      end if;

      Length := Length + 1;

      --  Length counts stored characters; convert it to an index that is
      --  relative to Result'First.

      Decode_Wide_Wide_Character
        (S, Ptr, Result (Result'First + Length - 1));
   end loop;
end Decode_Wide_Wide_String;
-------------------------
-- Next_Wide_Character --
-------------------------
procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
   --  Advances Ptr past one encoded character of Input by decoding it with
   --  Decode_Wide_Character and throwing the decoded value away. Any
   --  Constraint_Error raised by the decoder propagates to the caller.

   Ignored : Wide_Character;

begin
   Decode_Wide_Character (Input, Ptr, Ignored);
end Next_Wide_Character;
------------------------------
-- Next_Wide_Wide_Character --
------------------------------
procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
   --  Advances Ptr past one encoded character of Input by decoding it with
   --  Decode_Wide_Wide_Character and throwing the decoded value away. Any
   --  Constraint_Error raised by the decoder propagates to the caller.

   Ignored : Wide_Wide_Character;

begin
   Decode_Wide_Wide_Character (Input, Ptr, Ignored);
end Next_Wide_Wide_Character;
--------------
-- Past_End --
--------------
procedure Past_End is
   --  Common failure exit used when a pointer would move outside the string
   --  being processed or when an output buffer is full. Always raises
   --  Constraint_Error; never returns.
begin
   raise Constraint_Error with
     "past end of string";
end Past_End;
-------------------------
-- Prev_Wide_Character --
-------------------------
procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
   --  Moves Ptr backwards over one encoded wide character of Input.
   --
   --  Input : encoded string
   --  Ptr   : on entry, the position just past the character to skip (at
   --          most Input'Last + 1); on return, the position of the first
   --          byte of that character
   --
   --  Raises Constraint_Error through Past_End when Ptr is beyond
   --  Input'Last + 1 or there is no character before Ptr, and through Bad
   --  when the bytes before Ptr do not form a valid encoding of a
   --  Wide_Character.

begin
   if Ptr > Input'Last + 1 then
      Past_End;
   end if;

   --  Special efficient encoding for UTF-8 case

   if Encoding_Method = WCEM_UTF8 then
      UTF8 : declare
         U : Unsigned_32;

         procedure Getc;
         pragma Inline (Getc);
         --  Gets the character at Input (Ptr - 1) and returns code in U as
         --  Unsigned_32 value. On return Ptr is decremented by one. Calls
         --  Past_End if Ptr is already at or before Input'First.

         procedure Skip_UTF_Byte;
         pragma Inline (Skip_UTF_Byte);
         --  Checks that U is 2#10xxxxxx# and then calls Getc; calls Bad if
         --  U does not have that form.

         ----------
         -- Getc --
         ----------

         procedure Getc is
         begin
            if Ptr <= Input'First then
               Past_End;
            else
               Ptr := Ptr - 1;
               U := Unsigned_32 (Character'Pos (Input (Ptr)));
            end if;
         end Getc;

         -------------------
         -- Skip_UTF_Byte --
         -------------------

         procedure Skip_UTF_Byte is
         begin
            if (U and 2#11000000#) = 2#10_000000# then
               Getc;
            else
               Bad;
            end if;
         end Skip_UTF_Byte;

      --  Start of processing for UTF-8 case

      begin
         --  Walk backwards one byte at a time. Each step either recognizes
         --  a leading byte (done) or requires the byte just read to be a
         --  continuation byte before stepping back again.

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         Getc;

         if (U and 2#10000000#) = 2#00000000# then
            return;
         end if;

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11100000#) = 2#110_00000# then
            return;
         end if;

         --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11110000#) = 2#1110_0000# then
            return;
         end if;

         --  Any other code is invalid, note that this includes:

         --    16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
         --                             10xxxxxx

         --    16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
         --                                 10xxxxxx 10xxxxxx
         --                                 10xxxxxx

         --    16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
         --                                 10xxxxxx 10xxxxxx
         --                                 10xxxxxx 10xxxxxx

         --  since Wide_Character does not allow codes > 16#FFFF#

         Bad;
      end UTF8;

   --  Special efficient encoding for brackets case

   elsif Encoding_Method = WCEM_Brackets then
      Brackets : declare
         P : Natural;
         S : Natural;

      begin
         --  See if we have "] at end positions

         if Ptr > Input'First + 1
           and then Input (Ptr - 1) = ']'
           and then Input (Ptr - 2) = '"'
         then
            P := Ptr - 2;

            --  Loop back looking for [" at start

            while P >= Ptr - 10 loop
               if P <= Input'First + 1 then
                  Bad;

               elsif Input (P - 1) = '"'
                 and then Input (P - 2) = '['
               then
                  --  Found ["..."], scan forward to check it

                  S := P - 2;
                  P := S;
                  Next_Wide_Character (Input, P);

                  --  OK if at original pointer, else error

                  if P = Ptr then
                     Ptr := S;
                     return;
                  else
                     Bad;
                  end if;
               end if;

               P := P - 1;
            end loop;

            --  Falling through loop means more than 8 chars between the
            --  enclosing brackets (or simply a missing left bracket)

            Bad;

         --  Here if no bracket sequence present

         else
            --  Use <= (as the other encodings do) so that a Ptr lying
            --  before Input'First is rejected instead of being decremented.

            if Ptr <= Input'First then
               Past_End;
            else
               Ptr := Ptr - 1;
            end if;
         end if;
      end Brackets;

   --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
   --  go to the start of the string and skip forwards till Ptr matches.

   else
      --  This test is made outside the block below so that its
      --  Constraint_Error handler does not turn the "past end of string"
      --  error into a "bad encoding" error.

      if Ptr <= Input'First then
         Past_End;
      end if;

      Non_UTF_Brackets : declare
         Discard : Wide_Character;
         PtrS    : Natural;
         PtrP    : Natural;

      begin
         PtrS := Input'First;

         loop
            --  PtrP remembers where the character being decoded starts

            PtrP := PtrS;
            Decode_Wide_Character (Input, PtrS, Discard);

            if PtrS = Ptr then
               Ptr := PtrP;
               return;

            elsif PtrS > Ptr then
               Bad;
            end if;
         end loop;

      exception
         when Constraint_Error =>
            Bad;
      end Non_UTF_Brackets;
   end if;
end Prev_Wide_Character;
------------------------------
-- Prev_Wide_Wide_Character --
------------------------------
procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
   --  Moves Ptr backwards over one encoded wide wide character of Input.
   --
   --  Input : encoded string
   --  Ptr   : on entry, the position just past the character to skip (at
   --          most Input'Last + 1); on return, the position of the first
   --          byte of that character
   --
   --  Raises Constraint_Error through Past_End when Ptr is beyond
   --  Input'Last + 1 or there is no character before Ptr, and through Bad
   --  when the bytes before Ptr do not form a valid encoding.

begin
   if Ptr > Input'Last + 1 then
      Past_End;
   end if;

   --  Special efficient encoding for UTF-8 case

   if Encoding_Method = WCEM_UTF8 then
      UTF8 : declare
         U : Unsigned_32;

         procedure Getc;
         pragma Inline (Getc);
         --  Gets the character at Input (Ptr - 1) and returns code in U as
         --  Unsigned_32 value. On return Ptr is decremented by one. Calls
         --  Past_End if Ptr is already at or before Input'First.

         procedure Skip_UTF_Byte;
         pragma Inline (Skip_UTF_Byte);
         --  Checks that U is 2#10xxxxxx# and then calls Getc; calls Bad if
         --  U does not have that form.

         ----------
         -- Getc --
         ----------

         procedure Getc is
         begin
            if Ptr <= Input'First then
               Past_End;
            else
               Ptr := Ptr - 1;
               U := Unsigned_32 (Character'Pos (Input (Ptr)));
            end if;
         end Getc;

         -------------------
         -- Skip_UTF_Byte --
         -------------------

         procedure Skip_UTF_Byte is
         begin
            if (U and 2#11000000#) = 2#10_000000# then
               Getc;
            else
               Bad;
            end if;
         end Skip_UTF_Byte;

      --  Start of processing for UTF-8 case

      begin
         --  Walk backwards one byte at a time. Each step either recognizes
         --  a leading byte (done) or requires the byte just read to be a
         --  continuation byte before stepping back again.

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         Getc;

         if (U and 2#10000000#) = 2#00000000# then
            return;
         end if;

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11100000#) = 2#110_00000# then
            return;
         end if;

         --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11110000#) = 2#1110_0000# then
            return;
         end if;

         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
         --                           10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11111000#) = 2#11110_000# then
            return;
         end if;

         --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
         --                               10xxxxxx 10xxxxxx
         --                               10xxxxxx

         Skip_UTF_Byte;

         if (U and 2#11111100#) = 2#111110_00# then
            return;
         end if;

         --  Any other code is invalid, note that this includes:

         --    16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
         --                                 10xxxxxx 10xxxxxx
         --                                 10xxxxxx 10xxxxxx

         --  since Wide_Wide_Character does not allow codes
         --  greater than 16#03FF_FFFF#

         Bad;
      end UTF8;

   --  Special efficient encoding for brackets case

   elsif Encoding_Method = WCEM_Brackets then
      Brackets : declare
         P : Natural;
         S : Natural;

      begin
         --  See if we have "] at end positions

         if Ptr > Input'First + 1
           and then Input (Ptr - 1) = ']'
           and then Input (Ptr - 2) = '"'
         then
            P := Ptr - 2;

            --  Loop back looking for [" at start

            while P >= Ptr - 10 loop
               if P <= Input'First + 1 then
                  Bad;

               elsif Input (P - 1) = '"'
                 and then Input (P - 2) = '['
               then
                  --  Found ["..."], scan forward to check it

                  S := P - 2;
                  P := S;
                  Next_Wide_Wide_Character (Input, P);

                  --  OK if at original pointer, else error

                  if P = Ptr then
                     Ptr := S;
                     return;
                  else
                     Bad;
                  end if;
               end if;

               P := P - 1;
            end loop;

            --  Falling through loop means more than 8 chars between the
            --  enclosing brackets (or simply a missing left bracket)

            Bad;

         --  Here if no bracket sequence present

         else
            --  Use <= (as the other encodings do) so that a Ptr lying
            --  before Input'First is rejected instead of being decremented.

            if Ptr <= Input'First then
               Past_End;
            else
               Ptr := Ptr - 1;
            end if;
         end if;
      end Brackets;

   --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
   --  go to the start of the string and skip forwards till Ptr matches.

   else
      --  This test is made outside the block below so that its
      --  Constraint_Error handler does not turn the "past end of string"
      --  error into a "bad encoding" error.

      if Ptr <= Input'First then
         Past_End;
      end if;

      Non_UTF8_Brackets : declare
         Discard : Wide_Wide_Character;
         PtrS    : Natural;
         PtrP    : Natural;

      begin
         PtrS := Input'First;

         loop
            --  PtrP remembers where the character being decoded starts

            PtrP := PtrS;
            Decode_Wide_Wide_Character (Input, PtrS, Discard);

            if PtrS = Ptr then
               Ptr := PtrP;
               return;

            elsif PtrS > Ptr then
               Bad;
            end if;
         end loop;

      exception
         when Constraint_Error =>
            Bad;
      end Non_UTF8_Brackets;
   end if;
end Prev_Wide_Wide_Character;
--------------------------
-- Validate_Wide_String --
--------------------------
function Validate_Wide_String (S : String) return Boolean is
   --  Returns True when all of S can be stepped through with
   --  Next_Wide_Character, and False as soon as any step (or the initial
   --  assignment of S'First) raises Constraint_Error.

   Pos : Natural;

begin
   --  Assigned here rather than in the declaration so that the handler
   --  below also covers this assignment.

   Pos := S'First;

   loop
      exit when Pos > S'Last;
      Next_Wide_Character (S, Pos);
   end loop;

   return True;

exception
   when Constraint_Error =>
      return False;
end Validate_Wide_String;
-------------------------------
-- Validate_Wide_Wide_String --
-------------------------------
function Validate_Wide_Wide_String (S : String) return Boolean is
   --  Returns True when all of S can be stepped through with
   --  Next_Wide_Wide_Character, and False as soon as any step (or the
   --  initial assignment of S'First) raises Constraint_Error.

   Pos : Natural;

begin
   --  Assigned here rather than in the declaration so that the handler
   --  below also covers this assignment.

   Pos := S'First;

   loop
      exit when Pos > S'Last;
      Next_Wide_Wide_Character (S, Pos);
   end loop;

   return True;

exception
   when Constraint_Error =>
      return False;
end Validate_Wide_Wide_String;
end GNAT.Decode_String;