blob: 6887baf04a8dbbc346016f75f539636ef946151f [file] [log] [blame]
------------------------------------------------------------------------------
-- --
-- GNAT COMPILER COMPONENTS --
-- --
-- G N A T . B Y T E _ O R D E R _ M A R K --
-- --
-- B o d y --
-- --
-- Copyright (C) 2006-2022, AdaCore --
-- --
-- GNAT is free software; you can redistribute it and/or modify it under --
-- terms of the GNU General Public License as published by the Free Soft- --
-- ware Foundation; either version 3, or (at your option) any later ver- --
-- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception, --
-- version 3.1, as published by the Free Software Foundation. --
-- --
-- You should have received a copy of the GNU General Public License and --
-- a copy of the GCC Runtime Library Exception along with this program; --
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
-- <http://www.gnu.org/licenses/>. --
-- --
-- GNAT was originally developed by the GNAT team at New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc. --
-- --
------------------------------------------------------------------------------
package body GNAT.Byte_Order_Mark is

   --------------
   -- Read_BOM --
   --------------

   --  Inspect the leading bytes of Str and report which encoding signature,
   --  if any, they match.
   --
   --  Str          Raw bytes to examine. Only the first four bytes (or fewer
   --               if Str is shorter) are ever looked at, and the lower
   --               bound of Str may be any value.
   --  Len          Set to the length in bytes of the byte order mark found
   --               (4, 2 or 3). Set to 0 when no mark is found, which
   --               includes every XML-specific case below.
   --  BOM          Set to the encoding that was recognized, or to Unknown.
   --  XML_Support  When True, the first four bytes are additionally checked
   --               against the encoded forms of '<' and "<?" handled below.

   procedure Read_BOM
     (Str         : String;
      Len         : out Natural;
      BOM         : out BOM_Kind;
      XML_Support : Boolean := False)
   is
      --  Individual byte values used to spell out the signatures

      NUL  : constant Character := Character'Val (16#00#);
      LT   : constant Character := Character'Val (16#3C#);  --  '<'
      QM   : constant Character := Character'Val (16#3F#);  --  '?'
      B_BB : constant Character := Character'Val (16#BB#);
      B_BF : constant Character := Character'Val (16#BF#);
      B_EF : constant Character := Character'Val (16#EF#);
      B_FE : constant Character := Character'Val (16#FE#);
      B_FF : constant Character := Character'Val (16#FF#);

      --  Genuine byte order marks

      UTF32_BE_Mark : constant String := (NUL, NUL, B_FE, B_FF);
      UTF32_LE_Mark : constant String := (B_FF, B_FE, NUL, NUL);
      UTF16_BE_Mark : constant String := (B_FE, B_FF);
      UTF16_LE_Mark : constant String := (B_FF, B_FE);
      UTF8_Mark     : constant String := (B_EF, B_BB, B_BF);

      --  Four-byte patterns consulted only when XML_Support is True. Each
      --  one is built from the byte values of '<' and '?' (or "<?xm" in the
      --  last case) laid out in a particular width and byte order.

      XML_UCS4_BE   : constant String := (NUL, NUL, NUL, LT);
      XML_UCS4_LE   : constant String := (LT, NUL, NUL, NUL);
      XML_UCS4_2143 : constant String := (NUL, NUL, LT, NUL);
      XML_UCS4_3412 : constant String := (NUL, LT, NUL, NUL);
      XML_UTF16_BE  : constant String := (NUL, LT, NUL, QM);
      XML_UTF16_LE  : constant String := (LT, NUL, QM, NUL);
      XML_Byte_Wide : constant String :=
                        (LT, QM,
                         Character'Val (16#78#),   --  'x'
                         Character'Val (16#6D#));  --  'm'

      function Starts_With (Prefix : String) return Boolean;
      --  True if Str is at least as long as Prefix and its leading bytes
      --  are equal to Prefix.

      -----------------
      -- Starts_With --
      -----------------

      function Starts_With (Prefix : String) return Boolean is
      begin
         --  The length test is evaluated first, so the slice bound below
         --  never exceeds Str'Last.

         return Str'Length >= Prefix'Length
           and then
             Str (Str'First .. Str'First + (Prefix'Length - 1)) = Prefix;
      end Starts_With;

   --  Start of processing for Read_BOM

   begin
      --  The order of the tests matters: the UTF-32 little-endian mark
      --  begins with the two bytes of the UTF-16 little-endian mark, so the
      --  four-byte marks have to be tried before the two-byte ones.

      if Starts_With (UTF32_BE_Mark) then
         Len := UTF32_BE_Mark'Length;
         BOM := UTF32_BE;

      elsif Starts_With (UTF32_LE_Mark) then
         Len := UTF32_LE_Mark'Length;
         BOM := UTF32_LE;

      elsif Starts_With (UTF16_BE_Mark) then
         Len := UTF16_BE_Mark'Length;
         BOM := UTF16_BE;

      elsif Starts_With (UTF16_LE_Mark) then
         Len := UTF16_LE_Mark'Length;
         BOM := UTF16_LE;

      --  UTF-8 (endian-independent)

      elsif Starts_With (UTF8_Mark) then
         Len := UTF8_Mark'Length;
         BOM := UTF8_All;

      --  XML-only cases. All of them need four bytes and all of them report
      --  a length of zero.

      elsif XML_Support and then Str'Length >= 4 then
         Len := 0;

         --  UCS-4 (big-endian)

         if Starts_With (XML_UCS4_BE) then
            BOM := UCS4_BE;

         --  UCS-4 (little-endian)

         elsif Starts_With (XML_UCS4_LE) then
            BOM := UCS4_LE;

         --  UCS-4 (unusual byte order 2143)

         elsif Starts_With (XML_UCS4_2143) then
            BOM := UCS4_2143;

         --  UCS-4 (unusual byte order 3412)

         elsif Starts_With (XML_UCS4_3412) then
            BOM := UCS4_3412;

         --  UTF-16 (big-endian)

         elsif Starts_With (XML_UTF16_BE) then
            BOM := UTF16_BE;

         --  UTF-16 (little-endian)

         elsif Starts_With (XML_UTF16_LE) then
            BOM := UTF16_LE;

         --  "<?xm" in single bytes: UTF-8, ASCII, some part of ISO8859,
         --  Shift-JIS, EUC, ... The exact encoding cannot be told apart
         --  from these bytes, so the result is Unknown.

         elsif Starts_With (XML_Byte_Wide) then
            BOM := Unknown;

         --  Nothing recognized

         else
            BOM := Unknown;
         end if;

      --  No BOM recognized

      else
         Len := 0;
         BOM := Unknown;
      end if;
   end Read_BOM;

end GNAT.Byte_Order_Mark;