As shown here, @DeeDee has identified a bug in the implementation of Convert
for UTF_16
to UTF_8
. The problem arises in byte three of the four byte value for code points in the range U+10000
to U+10FFFF
, shown here. The source documents the relevant bit fields:
-- Codes in the range 16#10000# - 16#10FFFF#
-- UTF-16: 110110zzzzyyyyyy 110111yyxxxxxxxx
-- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
-- Note: zzzzz in the output is input zzzz + 1
Byte three is constructed as follows:
Result (Len + 3) :=
Character'Val
(2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 4)
or Shift_Right (xxxxxxxx, 6));
While the low four bits of yyyyyyyy
are used to construct byte three, the value only needs to be shifted two places left to make room for the top two bits of xxxxxxxx
. The correct formulation should be this:
Result (Len + 3) :=
Character'Val
(2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 2)
or Shift_Right (xxxxxxxx, 6));
For reference, the complete example below recapitulates the original implementation, with enough additions to study the problem in isolation. The output shows the code point, the expected binary representation of the UTF-8
encoding, the conversion to UTF-16
, the incorrect UTF-8
conversion, and the correct UTF-8
conversion.
Codepoint: 16#1D11E#
UTF-8: 4: 2#11110000# 2#10011101# 2#10000100# 2#10011110#
UTF-16: 2: 2#1101100000110100# 2#1101110100011110#
UTF-8: 4: 2#11110000# 2#10011101# 2#10010000# 2#10011110#
UTF-8: 4: 2#11110000# 2#10011101# 2#10000100# 2#10011110#
OK
Code:
-- https://stackoverflow.com/q/62564638/230513
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Integer_Text_IO; use Ada.Integer_Text_IO;
with Ada.Strings.UTF_Encoding; use Ada.Strings.UTF_Encoding;
with Ada.Strings.UTF_Encoding.Conversions;
use Ada.Strings.UTF_Encoding.Conversions;
with Ada.Strings.UTF_Encoding.Wide_Wide_Strings;
use Ada.Strings.UTF_Encoding.Wide_Wide_Strings;
with Interfaces; use Interfaces;
with Unchecked_Conversion;
procedure UTFTest is
-- http://www.fileformat.info/info/unicode/char/1d11e/index.htm
Clef : constant Wide_Wide_String :=
(1 => Wide_Wide_Character'Val (16#1D11E#));
Str_8 : constant UTF_8_String := Encode (Clef);
Str_16 : constant UTF_16_Wide_String := Convert (Str_8);
Str_8_New : constant UTF_8_String := Convert (Str_16);
My_Str_8 : UTF_8_String := Convert (Str_16);
function To_Unsigned_16 is new Unchecked_Conversion (Wide_Character,
Interfaces.Unsigned_16);
procedure Raise_Encoding_Error (Index : Natural) is
Val : constant String := Index'Img;
begin
raise Encoding_Error
with "bad input at Item (" & Val (Val'First + 1 .. Val'Last) & ')';
end Raise_Encoding_Error;
function My_Convert (Item : UTF_16_Wide_String;
Output_BOM : Boolean := False) return UTF_8_String
is
Result : UTF_8_String (1 .. 3 * Item'Length + 3);
-- Worst case is 3 output codes for each input code + BOM space
Len : Natural;
-- Number of result codes stored
Iptr : Natural;
-- Pointer to next input character
C1, C2 : Unsigned_16;
zzzzz : Unsigned_16;
yyyyyyyy : Unsigned_16;
xxxxxxxx : Unsigned_16;
-- Components of double length case
begin
Iptr := Item'First;
-- Skip BOM at start of input
if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
Iptr := Iptr + 1;
end if;
-- Generate output BOM if required
if Output_BOM then
Result (1 .. 3) := BOM_8;
Len := 3;
else
Len := 0;
end if;
-- Loop through input
while Iptr <= Item'Last loop
C1 := To_Unsigned_16 (Item (Iptr));
Iptr := Iptr + 1;
-- Codes in the range 16#0000# - 16#007F#
-- UTF-16: 000000000xxxxxxx
-- UTF-8: 0xxxxxxx
if C1 <= 16#007F# then
Result (Len + 1) := Character'Val (C1);
Len := Len + 1;
-- Codes in the range 16#80# - 16#7FF#
-- UTF-16: 00000yyyxxxxxxxx
-- UTF-8: 110yyyxx 10xxxxxx
elsif C1 <= 16#07FF# then
Result (Len + 1) :=
Character'Val (2#110_00000# or Shift_Right (C1, 6));
Result (Len + 2) :=
Character'Val (2#10_000000# or (C1 and 2#00_111111#));
Len := Len + 2;
-- Codes in the range 16#800# - 16#D7FF# or 16#E000# - 16#FFFF#
-- UTF-16: yyyyyyyyxxxxxxxx
-- UTF-8: 1110yyyy 10yyyyxx 10xxxxxx
elsif C1 <= 16#D7FF# or else C1 >= 16#E000# then
Result (Len + 1) :=
Character'Val (2#1110_0000# or Shift_Right (C1, 12));
Result (Len + 2) :=
Character'Val
(2#10_000000# or (Shift_Right (C1, 6) and 2#00_111111#));
Result (Len + 3) :=
Character'Val (2#10_000000# or (C1 and 2#00_111111#));
Len := Len + 3;
-- Codes in the range 16#10000# - 16#10FFFF#
-- UTF-16: 110110zzzzyyyyyy 110111yyxxxxxxxx
-- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
-- Note: zzzzz in the output is input zzzz + 1
elsif C1 <= 2#110110_11_11111111# then
if Iptr > Item'Last then
Raise_Encoding_Error (Iptr - 1);
else
C2 := To_Unsigned_16 (Item (Iptr));
Iptr := Iptr + 1;
end if;
if (C2 and 2#111111_00_00000000#) /= 2#110111_00_00000000# then
Raise_Encoding_Error (Iptr - 1);
end if;
zzzzz := (Shift_Right (C1, 6) and 2#1111#) + 1;
yyyyyyyy :=
((Shift_Left (C1, 2) and 2#111111_00#) or
(Shift_Right (C2, 8) and 2#000000_11#));
xxxxxxxx := C2 and 2#11111111#;
Result (Len + 1) :=
Character'Val (2#11110_000# or (Shift_Right (zzzzz, 2)));
Result (Len + 2) :=
Character'Val
(2#10_000000# or Shift_Left (zzzzz and 2#11#, 4) or
Shift_Right (yyyyyyyy, 4));
Result (Len + 3) :=
Character'Val
(2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 2) or
Shift_Right (xxxxxxxx, 6));
Result (Len + 4) :=
Character'Val (2#10_000000# or (xxxxxxxx and 2#00_111111#));
Len := Len + 4;
-- Error if input in 16#DC00# - 16#DFFF# (2nd surrogate with no 1st)
else
Raise_Encoding_Error (Iptr - 2);
end if;
end loop;
return Result (1 .. Len);
end My_Convert;
procedure Show (S : String) is
begin
Put(" UTF-8: ");
Put (S'Length, 1);
Put (":");
for C of S loop
Put (Character'Pos (C), 12, 2);
end loop;
New_Line;
end Show;
procedure Show (S : Wide_String) is
begin
Put("UTF-16: ");
Put (S'Length, 1);
Put (":");
for C of S loop
Put (Wide_Character'Pos (C), 20, 2);
end loop;
New_Line;
end Show;
begin
Put ("Codepoint:");
Put (Wide_Wide_Character'Pos (Clef (1)), 10, 16);
New_Line;
Show (Str_8);
Show (Str_16);
Show (Str_8_New);
My_Str_8 := My_Convert (Str_16);
Show (My_Str_8);
if Str_8 = My_Str_8 then
Put_Line ("OK");
else
Put_Line ("Bug");
end if;
end UTFTest;
See also Bug 95953 / Bug 95959.