It is easier to read bytes if you use hex. #255#254#49#0#100#0
is represented in hex as
FF FE 31 00 64 00
Where
FF FE
is the UTF-16LE BOM, which identifies the following bytes as being encoded as UTF-16 using values in Little Endian.
31 00
is the ASCII character '1'
64 00
is the ASCII character 'd'
.
To create a WideString
containing these bytes is very easy:
var
W: WideString;
S: String;
begin
S := '1d';
// Prepend the BOM character U+FEFF. Because the target is a WideString, the
// 8-bit text in S is converted to UTF-16 as part of the assignment.
W := WideChar($FEFF) + S;
end;
When an AnsiString
(which is Delphi 6's default string type) is assigned to a WideString
, the RTL automatically converts the AnsiString
data from 8-bit to UTF-16LE using the local machine's default Ansi charset for the conversion.
Going the other way is just as easy:
var
W: WideString;
S: String;
begin
W := WideChar($FEFF) + '1d';
// Copy from index 2 onward to skip the leading U+FEFF character; assigning
// the WideString result to S converts it to 8-bit text.
S := Copy(W, 2, MaxInt);
end;
When you assign a WideString
to an AnsiString
, the RTL automatically converts the WideString
data from UTF-16LE to 8-bit using the default Ansi charset.
If the default Ansi charset is not suitable for your needs (say the 8-bit data needs to be encoded in a different charset), you will have to use the Win32 API MultiByteToWideChar()
and WideCharToMultiByte()
functions directly (or a 3rd party library with equivalent functionality) so you can specify the desired charset/codepage as needed.
Now then, Delphi 6 does not offer any useful helpers to read Unicode files (Delphi 2009 and later do), so you will have to do it yourself manually, for example:
// Reads an entire text file and returns its contents as a WideString.
//
// The encoding is selected from the byte-order mark (BOM) found in the first
// bytes of the file: UTF-8, UTF-32 (LE or BE) or UTF-16 (LE or BE). A file
// with no recognized BOM is decoded as UTF-8. The BOM bytes themselves are
// not included in the result.
//
// FileName: path of the file to read. It is opened read-only with
//           fmShareDenyWrite.
// Returns:  the decoded text, or '' when there is no data after the BOM.
//
// Exceptions raised while opening or reading the stream are not caught here;
// the stream is always freed before they propagate.
function ReadUnicodeFile(const FileName: string): WideString;
const
  cBOM_UTF8: array[0..2] of Byte = ($EF, $BB, $BF);
  cBOM_UTF16BE: array[0..1] of Byte = ($FE, $FF);
  cBOM_UTF16LE: array[0..1] of Byte = ($FF, $FE);
  cBOM_UTF32BE: array[0..3] of Byte = ($00, $00, $FE, $FF);
  cBOM_UTF32LE: array[0..3] of Byte = ($FF, $FE, $00, $00);
var
  FS: TFileStream;
  BOM: array[0..3] of Byte;
  NumRead: Integer;
  U32: UCS4String;
  I: Integer;

  // Decodes everything from the current stream position to the end of the
  // file as UTF-8.
  function ReadRestAsUTF8: WideString;
  var
    U8: UTF8String;
  begin
    Result := '';
    SetLength(U8, FS.Size - FS.Position);
    if Length(U8) > 0 then
    begin
      FS.ReadBuffer(PAnsiChar(U8)^, Length(U8));
      Result := UTF8Decode(U8);
    end;
  end;

begin
  Result := '';
  FS := TFileStream.Create(FileName, fmOpenRead or fmShareDenyWrite);
  try
    // Read up to 4 bytes so that every supported BOM can be recognized.
    NumRead := FS.Read(BOM, 4);

    // UTF-8
    if (NumRead >= 3) and CompareMem(@BOM, @cBOM_UTF8, 3) then
    begin
      // The BOM is only 3 bytes; give back the extra byte that was read.
      if NumRead > 3 then
        FS.Seek(-(NumRead - 3), soCurrent);
      Result := ReadRestAsUTF8;
    end

    // The UTF-16LE and UTF-32LE BOMs are ambiguous (both start with FF FE)!
    // Check for UTF-32 first...
    // UTF-32
    else if (NumRead = 4) and
            (CompareMem(@BOM, @cBOM_UTF32LE, 4) or
             CompareMem(@BOM, @cBOM_UTF32BE, 4)) then
    begin
      // UCS4String is not a true string type, it is a dynamic array, so
      // it must include room for a null terminator...
      SetLength(U32, ((FS.Size - FS.Position) div SizeOf(UCS4Char)) + 1);
      if Length(U32) > 1 then
      begin
        FS.ReadBuffer(PUCS4Chars(U32)^, (Length(U32) - 1) * SizeOf(UCS4Char));
        if CompareMem(@BOM, @cBOM_UTF32BE, 4) then
        begin
          // Swap the byte order of every element (big endian -> little endian).
          for I := Low(U32) to High(U32) do
          begin
            U32[I] := ((U32[I] and $000000FF) shl 24) or
                      ((U32[I] and $0000FF00) shl 8) or
                      ((U32[I] and $00FF0000) shr 8) or
                      ((U32[I] and $FF000000) shr 24);
          end;
        end;
        U32[High(U32)] := 0;
        // Note: UCS4StringToWidestring() does not actually support UTF-16,
        // only UCS-2! If you need to handle UTF-16 surrogates, you will
        // have to convert from UTF-32 to UTF-16 manually, there is no RTL
        // or Win32 function that will do it for you...
        Result := UCS4StringToWidestring(U32);
      end;
    end

    // UTF-16
    else if (NumRead >= 2) and
            (CompareMem(@BOM, @cBOM_UTF16LE, 2) or
             CompareMem(@BOM, @cBOM_UTF16BE, 2)) then
    begin
      // The BOM is only 2 bytes; give back any extra bytes that were read.
      if NumRead > 2 then
        FS.Seek(-(NumRead - 2), soCurrent);
      // A trailing odd byte, if any, is ignored by the integer division.
      SetLength(Result, (FS.Size - FS.Position) div SizeOf(WideChar));
      if Length(Result) > 0 then
      begin
        FS.ReadBuffer(PWideChar(Result)^, Length(Result) * SizeOf(WideChar));
        if CompareMem(@BOM, @cBOM_UTF16BE, 2) then
        begin
          // Swap the two bytes of every code unit (big endian -> little endian).
          for I := 1 to Length(Result) do
          begin
            Result[I] := WideChar(
              ((Word(Result[I]) and $00FF) shl 8) or
              ((Word(Result[I]) and $FF00) shr 8)
            );
          end;
        end;
      end;
    end

    // something else, assuming UTF-8
    else
    begin
      // No BOM was consumed, so rewind to the start of the data.
      if NumRead > 0 then
        FS.Seek(-NumRead, soCurrent);
      Result := ReadRestAsUTF8;
    end;
  finally
    FS.Free;
  end;
end;
Update: if you want to store UTF-16LE encoded bytes inside of an AnsiString
variable (why?), then you can Move()
the raw bytes of a WideString
's character data into the memory block of an AnsiString
, e.g.:
// Returns an AnsiString whose bytes are a raw copy of AValue's character
// data (SizeOf(WideChar) bytes per character, copied as stored in memory).
// No charset conversion takes place, so the result holds binary data rather
// than readable 8-bit text.
function WideStringAsAnsi(const AValue: WideString): AnsiString;
var
  ByteCount: Integer;
begin
  ByteCount := Length(AValue) * SizeOf(WideChar);
  SetLength(Result, ByteCount);
  // Nothing to copy for an empty input; Result is already ''.
  if ByteCount > 0 then
    Move(AValue[1], Result[1], ByteCount);
end;
var
W: WideString;
S: AnsiString;
begin
W := WideChar($FEFF) + '1d';
// S receives the raw bytes of W's character data (BOM character included),
// not a charset-converted copy of the text.
S := WideStringAsAnsi(W);
end;
I would not suggest misusing AnsiString
like this, though. If you need bytes, operate on bytes, e.g.:
type
  // Plain dynamic array of bytes.
  TBytes = array of Byte;

// Returns a byte array holding a raw copy of AValue's character data
// (SizeOf(WideChar) bytes per character, copied as stored in memory).
// An empty input yields an empty array.
function WideStringAsBytes(const AValue: WideString): TBytes;
var
  ByteCount: Integer;
begin
  ByteCount := Length(AValue) * SizeOf(WideChar);
  SetLength(Result, ByteCount);
  // Skip the copy when there is nothing to move.
  if ByteCount > 0 then
    Move(AValue[1], Result[0], ByteCount);
end;
var
W: WideString;
B: TBytes;
begin
W := WideChar($FEFF) + '1d';
// B receives the raw bytes of W's character data, BOM character included.
B := WideStringAsBytes(W);
end;