None of the answers posted so far will correctly transcode an input string containing codepoints from the full Unicode range like for example "UnicodeSupportest♀️♂️❤️♀️".
That's why I wrote the following function, using only VBA inbuilt functions/statements that are available on both, Windows and MacOS.
This function works cross-platform and cross-app, and for the entire Unicode range.
codePoints > 65535
are also supported, even though VBAs inbuilt ChrW()
and AscW
don't support them, because the transcoding is done entirely "manually", including surrogate pairs. Performance should also be relatively good since the function works on a single byte-array buffer. If someone finds a bug or improvement, please let me know!
This code was improved as a result of this answer on CodeReview, many thanks to Cristian Buse for that!
'Function transcoding an UTF-8 encoded string to the VBA-native UTF-16-LE
'Author: Guido Witt-Dörring, https://stackoverflow.com/a/75787820/12287457
' https://github.com/guwidoe/VBA-StringTools
Public Function DecodeUTF8(ByVal utf8str As String, _
Optional ByVal raiseErrors As Boolean = False) As String
Const methodName As String = "DecodeUTF8"
Dim i As Long, j As Long, k As Long, numBytesOfCodePoint As Byte
Static numBytesOfCodePoints(0 To 255) As Byte
Static mask(2 To 4) As Long, minCp(2 To 4) As Long
If numBytesOfCodePoints(0) = 0 Then
For i = &H0& To &H7F&: numBytesOfCodePoints(i) = 1: Next i '0xxxxxxx
'110xxxxx - C0 and C1 are invalid (overlong encoding)
For i = &HC2& To &HDF&: numBytesOfCodePoints(i) = 2: Next i
For i = &HE0& To &HEF&: numBytesOfCodePoints(i) = 3: Next i '1110xxxx
'11110xxx - 11110100, 11110101+ (= &HF5+) outside of valid Unicode range
For i = &HF0& To &HF4&: numBytesOfCodePoints(i) = 4: Next i
For i = 2 To 4: mask(i) = (2 ^ (7 - i) - 1): Next i
minCp(2) = &H80&: minCp(3) = &H800&: minCp(4) = &H10000
End If
Dim utf8() As Byte, utf16() As Byte, codePoint As Long, currByte As Byte
utf8 = utf8str
ReDim utf16(0 To (UBound(utf8) - LBound(utf8) + 1) * 2)
i = LBound(utf8): j = 0
Do While i <= UBound(utf8)
codePoint = utf8(i)
numBytesOfCodePoint = numBytesOfCodePoints(codePoint)
If numBytesOfCodePoint = 0 Then
If raiseErrors Then Err.Raise 5, methodName, "Invalid byte"
GoTo insertErrChar
ElseIf numBytesOfCodePoint = 1 Then
utf16(j) = codePoint
j = j + 2
ElseIf i + numBytesOfCodePoint - 1 > UBound(utf8) Then
If raiseErrors Then _
Err.Raise 5, methodName, _
"Incomplete UTF-8 codepoint at end of string."
GoTo insertErrChar
Else
codePoint = utf8(i) And mask(numBytesOfCodePoint)
For k = 1 To numBytesOfCodePoint - 1
currByte = utf8(i + k)
If (currByte And &HC0&) = &H80& Then
codePoint = (codePoint * &H40&) + (currByte And &H3F)
Else
If raiseErrors Then _
Err.Raise 5, methodName, "Invalid continuation byte"
GoTo insertErrChar
End If
Next k
'Convert the Unicode codepoint to UTF-16LE bytes
If codePoint < minCp(numBytesOfCodePoint) Then
If raiseErrors Then _
Err.Raise 5, methodName, "Overlong encoding"
GoTo insertErrChar
ElseIf codePoint < &HD800& Then
utf16(j) = CByte(codePoint And &HFF&)
utf16(j + 1) = CByte(codePoint \ &H100&)
j = j + 2
ElseIf codePoint < &HE000& Then
If raiseErrors Then _
Err.Raise 5, methodName, _
"Invalid Unicode codepoint.(Range reserved for surrogate pairs)"
GoTo insertErrChar
ElseIf codePoint < &H10000 Then
If codePoint = &HFEFF& Then GoTo nextCp '(BOM - will be ignored)
utf16(j) = codePoint And &HFF&
utf16(j + 1) = codePoint \ &H100&
j = j + 2
ElseIf codePoint < &H110000 Then 'Calculate surrogate pair
Dim m As Long, lowSurrogate As Long, highSurrogate As Long
m = codePoint - &H10000 '(m \ &H400&) =most sign. 10 bits of m
highSurrogate = &HD800& Or (m \ &H400&)
lowSurrogate = &HDC00& Or (m And &H3FF) 'least sig. 10 bits of m
utf16(j) = highSurrogate And &HFF&
utf16(j + 1) = highSurrogate \ &H100&
utf16(j + 2) = lowSurrogate And &HFF&
utf16(j + 3) = lowSurrogate \ &H100&
j = j + 4
Else
If raiseErrors Then _
Err.Raise 5, methodName, _
"Codepoint outside of valid Unicode range"
insertErrChar: utf16(j) = &HFD: utf16(j + 1) = &HFF: j = j + 2
If numBytesOfCodePoint = 0 Then numBytesOfCodePoint = 1
End If
End If
nextCp: i = i + numBytesOfCodePoint 'Move to the next UTF-8 codepoint
Loop
DecodeUTF8 = MidB$(utf16, 1, j)
End Function