As R.. already wrote, there is no built-in solution, but I remembered that I did it in JavaScript a couple of years ago. A rough one-to-one translation to C:
#include <ctype.h>
#include <wchar.h>
/*
 * Checks whether w is a character representing a decimal digit between
 * 0 (zero) and 9 (nine) in one of the scripts listed in the table below.
 *
 * Returns 1 if w falls into one of the listed ranges, 0 otherwise.
 *
 * Notes:
 *  - Characters of the "Number, other" category
 *    (http://www.fileformat.info/info/unicode/category/No/list.htm) such as
 *    CIRCLED DIGITs (0x2460 - 0x2468, zero at 0x24ea), PARENTHESIZED DIGITs
 *    (0x2474 - 0x247c), DIGITs FULL STOP (0x2488 - 0x2490), DOUBLE CIRCLED
 *    DIGITs (0x24f5 - 0x24fd) and so on are deliberately not accepted.
 *    The ETHIOPIC DIGITs range is kept from the original list for backward
 *    compatibility.
 *  - The restriction to 16 bit is arbitrary. Digit-like characters above
 *    0xffff are not handled, e.g.: AEGEAN NUMBERs 1-9 (0x10107 - 0x1010f),
 *    COPTIC EPACT DIGITs 1-9 (0x102e1 - 0x102e9), KHAROSHTHI DIGITs 1-4
 *    (0x10a40 - 0x10a43), RUMI DIGITs 1-9 (0x10e60 - 0x10e68), DIGIT ZERO
 *    FULL STOP (0x1f100) and the digits with a comma (0x1f101 - 0x1f10a).
 *  - The table is not guaranteed to be exhaustive; extend it as needed and
 *    keep it sorted.
 *  - The ASCII range is tested by plain comparison instead of isdigit():
 *    passing a value that is neither EOF nor representable as unsigned char
 *    to isdigit() is undefined behavior, and almost every interesting wide
 *    character is such a value.
 */
int is_utf16_digit(wchar_t w)
{
    /* Inclusive code point ranges, sorted in ascending order. */
    static const struct {
        unsigned long first;
        unsigned long last;
    } digit_ranges[] = {
        { 0x0030, 0x0039 }, // ASCII DIGITs
        { 0x0660, 0x0669 }, // ARABIC-INDIC DIGITs
        { 0x06f0, 0x06f9 }, // EXTENDED ARABIC-INDIC DIGITs
        { 0x0966, 0x096f }, // DEVANAGARI DIGITs
        { 0x09e6, 0x09ef }, // BENGALI DIGITs
        { 0x0a66, 0x0a6f }, // GURMUKHI DIGITs
        { 0x0ae6, 0x0aef }, // GUJARATI DIGITs
        { 0x0b66, 0x0b6f }, // ORIYA DIGITs
        { 0x0be6, 0x0bef }, // TAMIL DIGITs (including TAMIL DIGIT ZERO)
        { 0x0c66, 0x0c6f }, // TELUGU DIGITs
        { 0x0ce6, 0x0cef }, // KANNADA DIGITs
        { 0x0d66, 0x0d6f }, // MALAYALAM DIGITs
        { 0x0e50, 0x0e59 }, // THAI DIGITs
        { 0x0ed0, 0x0ed9 }, // LAO DIGITs
        { 0x0f20, 0x0f29 }, // TIBETAN DIGITs
        { 0x1040, 0x1049 }, // MYANMAR DIGITs
        { 0x1369, 0x1371 }, // ETHIOPIC DIGITs (1-9 only)
        { 0x17e0, 0x17e9 }, // KHMER DIGITs
        { 0x1810, 0x1819 }, // MONGOLIAN DIGITs
        { 0xff10, 0xff19 }  // FULLWIDTH DIGITs (meh?)
    };
    // A negative wchar_t (where the type is signed) becomes a huge value
    // here and therefore matches no range.
    const unsigned long cp = (unsigned long)w;
    size_t i;

    for (i = 0; i < sizeof digit_ranges / sizeof digit_ranges[0]; i++) {
        if (cp < digit_ranges[i].first) {
            // The table is sorted: no later range can contain cp.
            return 0;
        }
        if (cp <= digit_ranges[i].last) {
            return 1;
        }
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <string.h>
/*
 * Small demo driver: converts the single command line argument from the
 * locale's multibyte encoding to wide characters and reports for every
 * character whether is_utf16_digit() accepts it.
 *
 * Exits with EXIT_FAILURE on wrong usage, allocation failure or an invalid
 * multibyte sequence in the argument; EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
    wchar_t *wbuf;
    const wchar_t *p;
    size_t slen, wcap, wlen;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s string\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    // Use the environment's locale so mbstowcs() knows the input encoding.
    setlocale(LC_ALL, "");

    // A multibyte string of slen bytes decodes to at most slen wide
    // characters; one more element is needed for the terminating L'\0'.
    slen = strlen(argv[1]);
    wcap = slen + 1;
    wbuf = malloc(wcap * sizeof *wbuf);
    if (wbuf == NULL) {
        fprintf(stderr, "malloc() failed to allocate %zu bytes\n",
                wcap * sizeof *wbuf);
        exit(EXIT_FAILURE);
    }

    // Pass the full capacity: mbstowcs() only writes the terminator if it
    // stops before reaching the limit.
    wlen = mbstowcs(wbuf, argv[1], wcap);
    if (wlen == (size_t)-1) {
        fprintf(stderr, "mbstowcs() failed\n");
        free(wbuf);
        exit(EXIT_FAILURE);
    }
    // adjusting memory size with a round of realloc() omitted

    // Walk with a separate cursor so wbuf can still be freed afterwards.
    for (p = wbuf; *p != L'\0'; p++) {
        // %lc expects a wint_t and %x an unsigned int.
        if (is_utf16_digit(*p)) {
            printf("\"%lc\" (0x%04x) is a digit\n",
                   (wint_t)*p, (unsigned int)*p);
        } else {
            printf("\"%lc\" (0x%04x) is not a digit\n",
                   (wint_t)*p, (unsigned int)*p);
        }
    }

    free(wbuf);
    return EXIT_SUCCESS;
}
Example with some characters representing digits from different scripts:
$ ./isutf16digit "asd0asd٠asd೯asd０asd"
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"0" (0x0030) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"٠" (0x0660) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"೯" (0x0cef) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"０" (0xff10) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
That, together with iswalpha(), should do it. But, as is always the case with Unicode and standard C without a specialized library: it's doable, but it's nasty. See above, for example: the code for testing is quite complicated, and it even assumes a lot to keep it simple.