Update 2022 NOV 20
If anybody is still here, wrangling with UTF-8 and C++: apparently, after years of waiting, we have the first leaves of a UTF-8 spring (in November 2022). Please see Tom's comment below.
I assume that in a few short months' time all of "the 3" will have UTF-8 implemented. With a bit of luck, C++23 will finally contain a full implementation of UTF-8, as advertised way back with C++11.
Just checked: cl.exe on this machine (19.34.31933) has no fully implemented standard <cuchar>.
Update 2022 APR 19
// with warnings but prints ok on LINUX
// g++ prog.cc -Wall -Wextra -std=c++2a
//
// clang++ prog.cc -Wall -Wextra -std=c++2a
// lot of warnings but prints OK on LINUX
//
#include <cassert>
#include <clocale>
#include <cstdio>
#include <cstdlib> // MB_CUR_MAX
#include <cuchar>
// Drop any earlier definitions so the two macros below can be (re)defined.
#undef P_
#undef P
// P_(FMT_, EXPR_): starts a new line, prints the source line number, then the
// text of EXPR_ right-aligned in 32 columns, then " => " followed by the value
// of EXPR_ formatted with the printf conversion FMT_.
#define P_(FMT_, EXPR_) \
  printf("\n%4d : %32s => " FMT_, __LINE__, #EXPR_, (EXPR_))
// P(FMT_, EXPR_): forwards to P_, so macro arguments are expanded before P_
// turns the expression into text.
#define P(FMT_, EXPR_) P_(FMT_, EXPR_)
/*
using mbstate_t = ... see description ...
using size_t = ... see description ...
in the standard but not implemented yet by any of the three
size_t mbrtoc8(char8_t* pc8, const char* s, size_t n, mbstate_t* ps);
size_t c8rtomb(char* s, char8_t c8, mbstate_t* ps);
*/
namespace {
// Value the cNNrtomb() family returns to report an encoding error.
constexpr inline auto bad_size = static_cast<size_t>(-1);
// https://en.wikipedia.org/wiki/UTF-8
// a compile time constant, not an intrinsic function:
// the largest number of bytes one UTF-8 encoded code point can occupy.
constexpr inline int UTF8_CHAR_MAX_BYTES = 4;

// Shared conversion loop behind every char_star() overload.
//
// Feeds each element of `in` (including the terminating NUL of a string
// literal) to `convert`, a callable with the signature of c16rtomb()/c32rtomb(),
// and appends the bytes it produces to a zero-filled buffer.
//
// Returns a struct whose `data` member holds the converted, NUL-terminated
// multibyte string. Conversion stops silently at the first element `convert`
// rejects; whatever was converted up to that point is returned.
//
// The buffer reserves UTF8_CHAR_MAX_BYTES per input element plus a terminator.
// NOTE(review): this assumes the active locale never needs more bytes than
// that per element, which holds for UTF-8 locales.
template <size_t N, typename CharT, typename Convert>
auto to_multibyte(const CharT (&in)[N], Convert convert) noexcept {
  static constexpr size_t out_size = (UTF8_CHAR_MAX_BYTES * N) + 1;
  struct {
    char data[out_size];
  } out = {{0}};
  // The conversion state must start zero-initialised ("initial state");
  // handing the converters an indeterminate mbstate_t is undefined behaviour.
  mbstate_t state{};
  char* cursor = out.data;
  for (const CharT unit : in) {
    const size_t written = convert(cursor, unit, &state);
    if (written == bad_size) break;
    // `written` may be 0, e.g. while the first half of a surrogate pair is
    // only being remembered in `state`.
    cursor += written;
  }
  return out;
}

#ifdef STANDARD_CUCHAR_IMPLEMENTED
// Converts a UTF-8 (char8_t) array to the locale's multibyte encoding.
// Read the result through the `data` member of the returned struct.
template <size_t N>
auto char_star(const char8_t (&in)[N]) noexcept {
  return to_multibyte(in, [](char* s, char8_t c8, mbstate_t* ps) {
    return c8rtomb(s, c8, ps);
  });
}
#endif  // STANDARD_CUCHAR_IMPLEMENTED

// Converts a UTF-16 (char16_t) array to the locale's multibyte encoding.
// Read the result through the `data` member of the returned struct.
template <size_t N>
auto char_star(const char16_t (&in)[N]) noexcept {
  return to_multibyte(in, [](char* s, char16_t c16, mbstate_t* ps) {
    return c16rtomb(s, c16, ps);
  });
}

// Converts a UTF-32 (char32_t) array to the locale's multibyte encoding.
// Read the result through the `data` member of the returned struct.
template <size_t N>
auto char_star(const char32_t (&in)[N]) noexcept {
  return to_multibyte(in, [](char* s, char32_t c32, mbstate_t* ps) {
    return c32rtomb(s, c32, ps);
  });
}
}  // namespace
// The same sample text as a narrow, a UTF-8, a UTF-16 and a UTF-32 literal.
#define KATAKANA "片仮名"
#define KATAKANA8 u8"片仮名"
#define KATAKANA16 u"片仮名"
#define KATAKANA32 U"片仮名"
// Prints the sample text in each literal flavour: the narrow and UTF-8
// literals directly, the UTF-16 and UTF-32 ones after conversion by char_star().
int main() {
  P("%s", KATAKANA);  // const char *
  // lot of warnings but ok output
  P("%s", KATAKANA8);  // const char8_t *
  /*
  garbled or no output
  P( "%s", KATAKANA16 ); // const char16_t *
  P( "%s" , KATAKANA32 ); // const char32_t *
  */
  // c16rtomb()/c32rtomb() encode into the multibyte encoding of the current
  // C locale, so a UTF-8 locale has to be selected before converting. If it
  // is not installed, say so instead of silently printing nothing below.
  if (setlocale(LC_ALL, "en_US.utf8") == nullptr) {
    fprintf(stderr,
            "\nsetlocale(LC_ALL, \"en_US.utf8\") failed; "
            "the converted output below may be empty\n");
  }
  // no can do as there is no standard <cuchar> yet
  // P( "%s", char_star(KATAKANA8).data ); // const char8_t *
  P("%s", char_star(KATAKANA16).data);  // const char16_t *
  P("%s", char_star(KATAKANA32).data);  // const char32_t *
}
Update 2021 MAR 19
A few things have (not) happened: __STDC_UTF_8__ is no more, and <cuchar> is still not implemented by any of "the Three".
Probably much better code matching this thread is HERE.
Update 2020 MAR 17
std::c8rtomb
and std::mbrtoc8
are not yet provided.
2019 NOV
std::c8rtomb and std::mbrtoc8, which enable the conversion between the execution encoding and UTF-8, are not yet provided by the future C++20-ready compilers made by "The 3". They are described in the C++20 standard.
It might be subjective, but c8rtomb()
is not an "awkward" interface, to me.
WANDBOX
// g++ prog.cc -std=gnu++2a
// clang++ prog.cc -std=c++2a
#include <stdio.h>
#include <clocale>
#ifndef __clang__
#include <cuchar>
#else
// clang has no <cuchar>
#include <uchar.h>
#endif
#include <climits>
// Prints a UTF-32 string on stdout by converting it, one code point at a time,
// to the multibyte encoding of the current C locale.
//
// str32: array of UTF-32 code units, typically a U"..." literal; the
//        terminating NUL is converted as well and produces no visible output.
//
// Stops at the first code point the locale cannot represent.
template<size_t N>
void u32sample( const char32_t (&str32)[N] )
{
#ifndef __clang__
    // <cuchar> declares these in namespace std; the clang build includes
    // <uchar.h> instead, where they are found in the global namespace.
    using std::mbstate_t;
    using std::c32rtomb;
#endif
    mbstate_t state{};
    char out[MB_LEN_MAX]{};
    for (const char32_t c : str32)
    {
        const size_t rc = c32rtomb(out, c, &state);
        if (rc == static_cast<size_t>(-1))
        {
            // Encoding error: `out` still holds the previous character and
            // the conversion state is no longer usable, so stop here.
            break;
        }
        // Print exactly the bytes written by this call. `out` is not
        // NUL-terminated by c32rtomb(), so "%s" would also print leftover
        // bytes from an earlier, longer character.
        printf("%.*s", static_cast<int>(rc), out);
    }
}
#ifdef __STDC_UTF_8__
// Prints a UTF-8 (char8_t) string on stdout by feeding it, one code unit at a
// time, to std::c8rtomb(), which converts to the multibyte encoding of the
// current C locale.
//
// str8: array of UTF-8 code units, typically a u8"..." literal.
//
// Stops at the first code unit std::c8rtomb() rejects.
template<size_t N>
void u8sample( const char8_t (& str8)[N])
{
    std::mbstate_t state{};
    char out[MB_LEN_MAX]{};
    for (const char8_t c : str8)
    {
        const size_t rc = std::c8rtomb(out, c, &state);
        if (rc == static_cast<size_t>(-1))
        {
            // Encoding error: nothing new was written and the conversion
            // state is no longer usable, so stop here.
            break;
        }
        // rc is 0 until the last code unit of a character has been supplied;
        // printing exactly rc bytes avoids re-printing the stale buffer on
        // those calls and avoids reading past the bytes actually written.
        printf("%.*s", static_cast<int>(rc), out);
    }
}
#endif // __STDC_UTF_8__
// Demo driver: selects a UTF-8 locale, prints the compiler version, then
// prints the sample text through u32sample() and, where available, u8sample().
int main () {
    // The conversions encode into the current C locale's multibyte encoding,
    // so report it when the UTF-8 locale could not be selected instead of
    // leaving the reader with unexplained empty output.
    if (std::setlocale(LC_ALL, "en_US.utf8") == nullptr) {
        fprintf(stderr, "\nstd::setlocale(LC_ALL, \"en_US.utf8\") failed; the converted output may be empty\n");
    }
#ifdef __linux__
    printf("\nLinux like OS, ") ;
#endif
    printf(" Compiler %s\n", __VERSION__ ) ;
    printf("\nchar32_t *, Converting to 'char *', and then printing --> " ) ;
    u32sample( U"ひらがな" ) ;
#ifdef __STDC_UTF_8__
    printf("\nchar8_t *, Converting to 'char *', and then printing --> " ) ;
    u8sample( u8"ひらがな" ) ;
#else
    printf("\n\n__STDC_UTF_8__ is not defined, can not use char8_t");
#endif
    printf("\n\nDone ..." ) ;
    // Exit status 0: a non-zero status would report failure to the caller
    // even though the demo ran to completion.
    return 0;
}
I have commented out, and documented, the lines which do not compile as of today.