Use this char encoding function with only one parameter

Question

After many search, I found the perfect function for my need here

Here is the code :

/* UTF-8 to ISO-8859-1/ISO-8859-15 mapper.
 * Return 0..255 for valid ISO-8859-15 code points, 256 otherwise.
*/
static inline unsigned int to_latin9(const unsigned int code)
{
    //printf("\ncode = %d", code);
    /* Code points 0 to U+00FF are the same in both. */
    if (code < 256U) {
        return code;
    } 
    switch (code) {
    case 0x0152U: return 188U; /* U+0152 = 0xBC: OE ligature */
    case 0x0153U: return 189U; /* U+0153 = 0xBD: oe ligature */
    case 0x0160U: return 166U; /* U+0160 = 0xA6: S with caron */
    case 0x0161U: return 168U; /* U+0161 = 0xA8: s with caron */
    case 0x0178U: return 190U; /* U+0178 = 0xBE: Y with diaresis */
    case 0x017DU: return 180U; /* U+017D = 0xB4: Z with caron */
    case 0x017EU: return 184U; /* U+017E = 0xB8: z with caron */
    case 0x20ACU: return 164U; /* U+20AC = 0xA4: Euro */
    default:      return 256U;
    }
}

/* Convert an UTF-8 string to ISO-8859-15.
 * All invalid sequences are ignored.
 * Note: output == input is allowed,
 * but   input < output < input + length
 * is not.
 * Output has to have room for (length+1) chars, including the trailing NUL byte.
 */
size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
{
    unsigned char             *out = (unsigned char *)output;
    const unsigned char       *in  = (const unsigned char *)input;
    const unsigned char *const end = (const unsigned char *)input + length;
    unsigned int               c;

    while (in < end)
        if (*in < 128)
            *(out++) = *(in++); /* Valid codepoint */
        else
        if (*in < 192)
            in++;               /* 10000000 .. 10111111 are invalid */
        else
        if (*in < 224) {        /* 110xxxxx 10xxxxxx */
            if (in + 1 >= end)
                break;
            if ((in[1] & 192U) == 128U) {
                c = to_latin9( (((unsigned int)(in[0] & 0x1FU)) << 6U)
                             |  ((unsigned int)(in[1] & 0x3FU)) );
                if (c < 256)
                    *(out++) = c;
            }
            in += 2;

        } else
        if (*in < 240) {        /* 1110xxxx 10xxxxxx 10xxxxxx */
            if (in + 2 >= end)
                break;
            if ((in[1] & 192U) == 128U &&
                (in[2] & 192U) == 128U) {
                c = to_latin9( (((unsigned int)(in[0] & 0x0FU)) << 12U)
                             | (((unsigned int)(in[1] & 0x3FU)) << 6U)
                             |  ((unsigned int)(in[2] & 0x3FU)) );
                if (c < 256)
                    *(out++) = c;
            }
            in += 3;

        } else
        if (*in < 248) {        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (in + 3 >= end)
                break;
            if ((in[1] & 192U) == 128U &&
                (in[2] & 192U) == 128U &&
                (in[3] & 192U) == 128U) {
                c = to_latin9( (((unsigned int)(in[0] & 0x07U)) << 18U)
                             | (((unsigned int)(in[1] & 0x3FU)) << 12U)
                             | (((unsigned int)(in[2] & 0x3FU)) << 6U)
                             |  ((unsigned int)(in[3] & 0x3FU)) );
                if (c < 256)
                    *(out++) = c;
            }
            in += 4;

        } else
        if (*in < 252) {        /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (in + 4 >= end)
                break;
            if ((in[1] & 192U) == 128U &&
                (in[2] & 192U) == 128U &&
                (in[3] & 192U) == 128U &&
                (in[4] & 192U) == 128U) {
                c = to_latin9( (((unsigned int)(in[0] & 0x03U)) << 24U)
                             | (((unsigned int)(in[1] & 0x3FU)) << 18U)
                             | (((unsigned int)(in[2] & 0x3FU)) << 12U)
                             | (((unsigned int)(in[3] & 0x3FU)) << 6U)
                             |  ((unsigned int)(in[4] & 0x3FU)) );
                if (c < 256)
                    *(out++) = c;
            }
            in += 5;

        } else
        if (*in < 254) {        /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (in + 5 >= end)
                break;
            if ((in[1] & 192U) == 128U &&
                (in[2] & 192U) == 128U &&
                (in[3] & 192U) == 128U &&
                (in[4] & 192U) == 128U &&
                (in[5] & 192U) == 128U) {
                c = to_latin9( (((unsigned int)(in[0] & 0x01U)) << 30U)
                             | (((unsigned int)(in[1] & 0x3FU)) << 24U)
                             | (((unsigned int)(in[2] & 0x3FU)) << 18U)
                             | (((unsigned int)(in[3] & 0x3FU)) << 12U)
                             | (((unsigned int)(in[4] & 0x3FU)) << 6U)
                             |  ((unsigned int)(in[5] & 0x3FU)) );
                if (c < 256)
                    *(out++) = c;
            }
            in += 6;

        } else
            in++;               /* 11111110 and 11111111 are invalid */

    /* Terminate the output string. */
    *out = '\0';

    return (size_t)(out - (unsigned char *)output);
}

This work perfectly.

But this function take 1 buffer_input and return me a second buffer_output. For optimize it, I would like use only one buffer_input and the function return me the buffer_input modified.

I don't know if it's possible. I have not enough knowledge for doing it myself (I have trouble to understanding this code).

It is hard to edit this code for what I want to achieve ?

For the moment I use like this :

utf8_to_latin9(buffer_output, buffer_input, length)

After edit I would like use like this :

utf8_to_latin9(buffer_input, buffer_input, length)

or

utf8_to_latin9(buffer_input, length)

I am so tired. Didn't see the `utf8_to_latin9` function. Deleted my answer. sorry. — bolov, Jul 26 '16 at 12:23
@DevSolar Thats what the function `to_latin9` does, it filter the 8 char which are not longer in latin1 and repalce it by the char in latin9. — Jax Teller, Jul 26 '16 at 12:29
Not commenting on the conversion itself, but in terms of the memory accesses for any such algorithm, just think about it: If, as here, `in` is always ahead of or equal to `out`, you can just write back to the input buffer, without risking reading new (post-update) data when you want old. Make `out` a pointer into the same buffer instead of a separate one, and adjust the function signature and some minor details of the body accordingly. This isn't difficult. — underscore_d, Jul 26 '16 at 12:31
Deleted my previous comments to put the finger right on the point where your code breaks: If the UTF-8 input contains valid Latin1 characters that are not also valid Latin9 characters (any one of `¤¦¨´¸¼½¾`), they will be "converted" to their Latin9 counterparts with the same value (instead of giving a `256` conversion error). Both `\u00a4` and `\u20ac` will yield the same output, `0xa4`, which is correct for the latter but in error for the former. -- I recommend the [ICU library](http://icu-project.org/apiref/icu4c/ucnv_8h.html) for all your Unicode needs instead of trying to roll your own. — DevSolar, Jul 26 '16 at 12:46
1. Why not use an existing conversion library? 2. Why optimize small things? — n. m. could be an AI, Jul 26 '16 at 13:13
@n.m. 1.because as you said, it need a library. 2. I'm working in embedded computing and use 1 buffer instead of 2 is not negligible. — Jax Teller, Jul 26 '16 at 13:17
if you want to understand you just need to know [how UTF-8 encoding works](https://en.wikipedia.org/wiki/UTF-8). It's really simple — phuclv, Jul 26 '16 at 13:32
Then you may want to do a more accurate conversion. Unicode characters may arrive decomposed, in which case you will lose accents. — n. m. could be an AI, Jul 26 '16 at 15:09

DevSolar · Accepted Answer · 2016-07-26T14:58:51.973

I have not corrected (or checked) your code for any problems, I just modified it to work in-place. Note my comment regarding the faulty conversion in utf8_to_latin9(), for one.

The conversion to in-place operation is really simple:

Switch to one buffer parameter, make out point at the beginning of the input buffer (instead of a separate output buffer):

< size_t utf8_to_latin9(char *const output, const char *const input, const size_t length)
< {
<     unsigned char             *out = (unsigned char *)output;
----
> size_t utf8_to_latin9(char *const input, const size_t length)
> {
>     unsigned char             *out = (unsigned char *)input;

At the end of the conversion, calculate the return value based on the input buffer (instead of the separate output buffer):

< return (size_t)(out - (unsigned char *)output);
----
> return (size_t)(out - (unsigned char *)input);

Since there is no way the UTF-8 input buffer could be consumed at a slower rate than the Latin9 output buffer is being filled, this is quite safe.

My test main():

int main()
{
    char input[256] = {0};
                    /*  ¤ *//*    €   *//* some kanji */
    strcpy( input, "\xc2\xa4\xe2\x82\xac\xf0\xaf\xa4\xa9" );
    utf8_to_latin9( input, 9 );
    printf( "%hhx ", input[0] );
    printf( "%hhx ", input[1] );
    printf( "%hhx ", input[2] );
    printf( "%hhx ", input[3] );
    printf( "%hhx ", input[4] );
    printf( "%hhx ", input[5] );
    printf( "%hhx ", input[6] );
    printf( "%hhx ", input[7] );
    printf( "%hhx\n", input[8] );
    return 0;
}

Output:

a4 a4 0 82 ac f0 af a4 a9

That is what I would expect (showcasing the conversion problem I commented upon).

it is the same then write `utf8_to_latin9(char *const input, const char *const input, const size_t)` and it doesn"t work, i don't know why — Jax Teller, Jul 26 '16 at 13:41
@A.Rossi: "It does not work" is not, never was, and never will be a correct failure analysis. Your code lacks a `main()` feeding test input to the function. I wrote one, and it succeeded in meeting the expected output, so I consider the case closed. If you have further problem, please include a MCVE, observed output, and expected output. — DevSolar, Jul 26 '16 at 13:46
@A.Rossi: Added my test code. Please state how your results and / or expectations differ. — DevSolar, Jul 26 '16 at 13:52
It already works in place as is, see the comment `Note: output == input is allowed`. — n. m. could be an AI, Jul 26 '16 at 15:01

Use this char encoding function with only one parameter

1 Answers1