Using UTF-8 Datastreams in cURL, when is the datastream converted from UTF-8 to ASCII?

Question

When using cURL to take character streams off of the internet, when is the datastream converted from a multibyte datatype to a single byte character array?

I wrote a program here, which appears to work using ASCII in the callback function.

However, I wrote another program that uses UTF-8 with wchar_t datatypes, which also appears to work. The datastream does not appear to differentiate between the two datatypes, even though a wchar_t type is 4 bytes on my machine and a char is 1 byte.

I guess that some sort of type conversion is happening transparently to this program, but I am not aware of it. (I think that in UTF-8, ASCII characters still take 1 byte of memory, but that when a program uses wchar_t datatypes the system pads regular ASCII characters with zeros, converting them to 4 bytes; however, this was not something the programmer implemented...)

#include "multicurl.h"

#define MAX_WAIT_MSECS (5 * 1000) /* Wait max. 5 seconds; parenthesized so the macro expands safely inside larger expressions */

/*  cURL write callback: append one chunk of the response body to the MemType
    buffer that was registered with CURLOPT_WRITEDATA.

    ptr      - the chunk handed over by cURL. It is declared wchar_t* here, but
               it is only ever copied as raw bytes and carries no terminator.
    size     - size of one item in the chunk.
    nmemb    - number of items; the chunk is size * nmemb bytes long.
    userdata - the MemType whose memory/size members are grown and updated.

    Returns the number of bytes consumed (always the whole chunk), which cURL
    compares against the amount it delivered.
    Terminates the process if the buffer cannot be enlarged. */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){
    size_t realsize = nmemb * size;     // Bytes in this chunk.
    MemType *mem = (MemType *)userdata;

    // Room for what we already hold, the new chunk, and one zeroed wchar_t as terminator.
    wchar_t *tmp = realloc(mem->memory, mem->size + realsize + sizeof(wchar_t));

    if (tmp == NULL){
        printf("Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }

    mem->memory = tmp;

    // mem->size is a BYTE count and chunk lengths need not be multiples of
    // sizeof(wchar_t). Indexing the buffer in wchar_t units would round the
    // offset down and overwrite bytes that were already received, so all
    // offsets are computed on a byte pointer instead.
    char *bytes = (char *)tmp;
    memcpy(bytes + mem->size, ptr, realsize);       // Overwrites the previous terminator.
    mem->size += realsize;
    memset(bytes + mem->size, 0, sizeof(wchar_t));  // Fresh terminator right after the data.

    return realsize;
}

/*  Create an easy handle for one URL, point its write callback at `output`,
    and attach it to the multi handle `mh`.

    mh       - multi handle the new easy handle is added to.
    utf8_url - the request URL as a wide-character string.
    output   - result structure; its buffer is (re)initialised here to an
               empty, terminated buffer and later filled by write_callback.

    Always returns NULL; results are delivered through `output`.
    Terminates the process if memory cannot be allocated. If the URL cannot
    be converted, or the handle cannot be created or attached, `output` is
    simply left empty. */
void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){

    /* Start from an empty buffer that is already terminated, so the caller
       sees a valid empty result even if no data ever arrives. */
    output->size = 0;
    output->memory = malloc( sizeof( wchar_t ) );
    if (output->memory == NULL){
        printf("Not Enough Memory, malloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    output->memory[0] = 0;

    /* Convert the wide-character URL to a multibyte string for cURL.
       One wide character expands to at most MB_CUR_MAX bytes, so this
       capacity can never be exceeded by wcstombs(). */
    size_t url_capacity = wcslen( utf8_url ) * MB_CUR_MAX + 1;
    char *url = malloc( url_capacity );
    if (url == NULL){
        printf("Not Enough Memory, malloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    if (wcstombs(url, utf8_url, url_capacity) == (size_t)-1){
        fprintf(stderr, "error: the URL cannot be represented in the current locale\n");
        free(url);
        return NULL;
    }

    // Initialize the cURL handle.
    CURL *hnd = curl_easy_init();

    if(hnd){

        // Setup the cURL options.
        curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
        curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL
        curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
        curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
        curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
        curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
        curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
        curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// The callback function to write data to.
        curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Send the address of the data struct to callback func.
        //curl_easy_setopt(hnd, CURLOPT_VERBOSE, 1);

        CURLMcode added = curl_multi_add_handle(mh, hnd);
        if (added != CURLM_OK){
            fprintf(stderr, "error: curl_multi_add_handle() returned %d\n", (int)added);
            curl_easy_cleanup(hnd);// Nobody else owns the handle, so release it here.
        }
    }

    /* libcurl keeps its own copy of the string given to CURLOPT_URL
       (7.17.0 and later), so the temporary conversion buffer can go. */
    free(url);
    return NULL;// The output struct was passed by reference no need to return anything.
}

/* Initialise libcurl's global state with CURL_GLOBAL_ALL, then create a multi
   handle and hand it to the caller exactly as curl_multi_init() returned it. */
CURLM *SetUpMultiCurlHandle(){
    curl_global_init(CURL_GLOBAL_ALL);
    return curl_multi_init();
}

void *PerformMultiCurl(CURLM * mh) 
/* Run every transfer attached to the multi handle `mh` to completion, then
   release the finished easy handles, the multi handle and libcurl's global state.

   The transfers progress concurrently, driven by the repeated
   curl_multi_perform() calls in the loop below; this function itself blocks
   until none of them is still running.

   Always returns NULL. If curl_multi_wait() fails, the function reports the
   error and returns early WITHOUT cleaning anything up, as before. */
{
    CURLMsg *msg = NULL;
    int still_running = 0;
    int msgs_left = 0;

    curl_multi_perform(mh, &still_running);// Perform the requests.
    do {
        int numfds = 0;
        int res = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if(res != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", res);
            return NULL;
        }
        curl_multi_perform(mh, &still_running);

        /* Without this loop the program would move on before the responses
           have been received from the server. */
    } while(still_running);


    /* Report the outcome of every finished transfer and release its handle.
       You could change this to make the handles more persistent. */
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg == CURLMSG_DONE) {
            /* Copy what we need out of the message first: its contents are
               not guaranteed to survive curl_multi_remove_handle(). */
            CURL *hnd = msg->easy_handle;
            CURLcode return_code = msg->data.result;

            if(return_code != CURLE_OK) {
                fprintf(stderr, "CURL error code: %d\n", return_code);
            }

            /* Release the handle whether the transfer succeeded or not;
               skipping this on failure would leak the easy handle and leave
               it attached while the multi handle is destroyed below. */
            curl_multi_remove_handle(mh, hnd);
            curl_easy_cleanup(hnd);
        }
        else {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", msg->msg);
        }
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}

The full UTF-8 variation of this program can be found here

UTF-8 does not specify a datatype. Maybe a better way to ask this would be why is it I can download bitstreams of an unknown datatype and chop them up into char or wchar_ts without concern for the length of each character? ..And yet my program still translates them properly. — NorthStar, Jan 10 '22 at 17:17
This was never a concern when using chars, but it is a concern when using wchar_ts. It is also a concern when using UTF-8 because some characters are longer than 1 byte, but not all UTF-8 characters. — NorthStar, Jan 10 '22 at 17:26
The problem was that I needed a UTF-8 parser. It was unrelated to libcurl or multicurl. See the answers below for a UTF-8 parser. — NorthStar, Jan 11 '22 at 19:35

ikegami · Accepted Answer · 2022-01-10T17:58:27.897

1

As you would expect, it doesn't work. libcurl has no way to know the function expects a wchar_t* when it should expect a char*.

If you inspect MyOutputStruct1.memory[0], you'll find it doesn't contain what it should. For example, when requesting https://stackoverflow.com, it contains 0x4f44213c. This is obviously wrong since that's far outside the range of valid Code Points. This is actually the first four Code Points (<!DO) jammed into one wchar_t (in LE order).

It kind of appears to work because of a second bug. When printing a wide string, you need to use %ls, not %s.

wprintf(L"Output:\n%s\n", MyOutputStruct1.memory);

should be

printf("Output:\n%ls\n", MyOutputStruct1.memory);
// -or-
wprintf(L"Output:\n%ls\n", MyOutputStruct1.memory);

Basically, the code expects a char* throughout. The pointer's type is wchar_t*, but it's used as a char* everywhere. As such, the two bugs mostly "cancel out" in the program in question. (I didn't look, but I expect a problem with inputs with a length that isn't divisible by sizeof(wchar_t).) If the pointer had actually been used as a wchar_t* (e.g. if its elements had been inspected or if it had been passed to a w function), the problem would have been obvious.

edited Jan 10 '22 at 17:58

answered Jan 10 '22 at 17:16

ikegami

367,544
15
269
518

I suppose, then, that when getting bitstreams from the internet one should always store them in 1-byte datatypes and convert them afterwards to some usable format. – NorthStar Jan 10 '22 at 18:03
Not necessarily. If there's a way to ask libcurl to decode the input into wchar_t for you, great! (For good reasons, this is unlikely to exist for libcurl, though.) – ikegami Jan 10 '22 at 18:05
Do note that it would be easier to decode it once you have received the entire thing. Otherwise, you have to deal with the situation where the encoding of a Code Point is split across two chunks. You would need to buffer the extra bytes for later decoding. It would look something [like this](https://gist.github.com/ikegami/e173f30fa6035fcf846b85ebee9e7d48). – ikegami Jan 10 '22 at 18:12
It appears as though you can just store any arbitrary encoding within a char array, so long as you decode it properly. For example, this is a string of wide characters stored in a char array, which prints properly if decoded properly, but as expected prints garbage when I try to print it as a char string: `char * string = L"риветHello World!";` `wprintf(L"\n%ls\n", string);` `printf("\n%s\n",string);` `printf("\n%c\n",string[0]);` – NorthStar Jan 10 '22 at 19:29
In light of this I suppose I should have just looked for a UTF-8 parser, then split the bit stream into wchar_t after the transfer. – NorthStar Jan 10 '22 at 19:37
If you only need ascii characters all of this is for nothing, just use char, but if you want to handle multibyte unicode this would be important [ascii and utf-8 are exactly the same for the same characters]. – NorthStar Jan 10 '22 at 20:19

NorthStar · Answer 2 · 2022-01-11T03:01:29.467

As was stated in the comment section all this really needed was a UTF-8 parser. Chars can hold UTF-8 but we cannot easily address each character individually without converting them to some other datatype [some UTF-8 characters are larger than 1 byte]. So I wrote a parser with the help of libutf-8.

/* gcc unicode.c -o unicode -lutf-8 
This program makes use of libutf-8.
http://www.whizkidtech.redprince.net/i18n/
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>
#include <utf-8.h>

int* parse_UTF8_bitstream( size_t *len, const char *input_stream )
/* Decode a NUL-terminated UTF-8 byte string into a zero-terminated array of
   4-byte integers, one element per decoded character, so that each character
   can be addressed individually.

   len          - receives the number of decoded characters, not counting
                  the terminating zero.
   input_stream - the UTF-8 bytes to decode.

   Returns a malloc()ed array the caller must free(), or NULL (with *len set
   to 0) if memory could not be allocated. Decoding stops early, keeping what
   was decoded so far, if sgetu8() reports a byte count that is not positive
   or that runs past the end of the input.
   NOTE(review): how sgetu8() signals malformed input is not visible here;
   confirm against the libutf-8 documentation. */
{
    size_t bytes = strlen( input_stream );
    size_t i = 0;   /* Byte offset into input_stream. */
    int n = 0;      /* Bytes consumed by the last sgetu8() call. */
    unsigned int *output;

    *len = 0;

    /* Every character occupies at least one byte, so bytes + 1 elements are
       always enough; this replaces one realloc() per character. */
    output = malloc( ( bytes + 1 ) * sizeof *output );
    if ( output == NULL ) return NULL;

    while ( i < bytes ){
        output[ *len ] = sgetu8( (unsigned char *) &input_stream[ i ], &n );

        /* A non-positive count would loop forever and an over-long one would
           step past the terminator, so stop decoding in either case. */
        if ( n <= 0 || (size_t) n > bytes - i ) break;

        i += (size_t) n; /* Skip this many chars to the next UTF-8 code. */
        *len = *len + 1;
    }

    /* Make sure the last character is NULL */
    output[ *len ] = 0;

    return (int*)output; /* This is our wide character string. */
}

void process_string(const char *s)
/* Print the UTF-8 string `s` and its length in bytes, decode it with
   parse_UTF8_bitstream(), then print the decoded string, its length in
   characters, and every character on its own line.
   Terminates the process if the decoder returns no buffer. */
{
    printf("%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu is the size_t conversion. */

    size_t len;
    int* outputstream = parse_UTF8_bitstream( &len, s );

    if ( outputstream == NULL ){
        printf("\nThe parser could not allocate memory.\n");
        exit ( EXIT_FAILURE );
    }

    /* NOTE(review): %ls expects wchar_t*; this relies on wchar_t being a
       4-byte int on the target platform, as the original code did. */
    printf("\n%ls\n", (const wchar_t *) outputstream);
    printf("LENGTH: %zu Wide Characters\n", len);
    for(size_t i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }

    free ( outputstream );
}


int main(void)
/* Run process_string() on "Hello World" written in Arabic, Russian, and
   Greek, after adopting the locale from the environment. */
{
    const char *greetings[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε"
    };
    size_t count = sizeof greetings / sizeof greetings[ 0 ];
    size_t k;

    setlocale(LC_ALL, "");

    for ( k = 0; k < count; k++ ){
        process_string( greetings[ k ] );
    }

    return 0;
}

NorthStar · Answer 3 · 2022-01-22T19:17:28.663

This is the same program I posted previously, however, it does not require any special libraries. It uses the mbtowc() function which is in the stdlib.

From the mbtowc() man page:

#include <stdlib.h>

int mbtowc(wchar_t * restrict wcharp, const char * restrict mbchar, size_t nbytes);

If mbchar is NULL, the mbtowc() function returns nonzero if shift states are supported, zero otherwise.

Otherwise, if mbchar is not a null pointer, mbtowc() either returns 0 if mbchar represents the null wide character, or returns the number of bytes processed in mbchar, or returns -1 if no multibyte character could be recognized or converted. In this case, mbtowc()'s internal conversion state is undefined.

/* cc unicode.c -o unicode  */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <locale.h>


int parse_UTF8_bitstream(wchar_t **output_stream, const char *input_stream )
/*  Decode a NUL-terminated multibyte string (UTF-8 when the active locale is
    a UTF-8 locale) into a zero-terminated wchar_t string, so that each
    character can be addressed individually.

    output_stream - receives a malloc()ed buffer that the caller must free().
                    It is set on every path: NULL if allocation failed,
                    otherwise a terminated wide string (on a decoding error it
                    holds the characters decoded before the bad sequence).
    input_stream  - the bytes to decode.

    Returns the number of wide characters, not counting the terminator, or -1
    if the input contains an invalid sequence or memory could not be allocated.
*/
{
    size_t remaining = strlen( input_stream ); /* Bytes not yet decoded. */
    int len = 0; /* Number of wide characters produced so far. */
    wchar_t *buffer = NULL;

    /* Every character occupies at least one byte, so remaining + 1 elements
       are always enough; the guard keeps the size computation from wrapping. */
    if ( remaining < (size_t) -1 / sizeof *buffer ){
        buffer = malloc( ( remaining + 1 ) * sizeof *buffer );
    }
    *output_stream = buffer;
    if ( buffer == NULL ) return -1;

    /* Put mbtowc() into its initial conversion state. */
    (void) mbtowc( NULL, NULL, 0 );

    while ( remaining > 0 ){
        /* The third argument limits how many input BYTES may be examined; it
           is unrelated to sizeof( wchar_t ). Passing the remaining length
           keeps mbtowc() inside the string. */
        int consumed = mbtowc( &buffer[ len ], input_stream, remaining );

        if ( consumed <= 0 ){
            buffer[ len ] = 0; /* Leave a valid string behind for the caller. */
            return -1;
        }

        /* Skip this many chars to the next UTF-8 code. */
        input_stream += consumed;
        remaining -= (size_t) consumed;
        len = len + 1;
    }

    /* Make sure the last wide-character is NULL */
    buffer[ len ] = 0;

    return len; /* This is the length of the wide character string. */
}

void process_string(const char *s)
/* Print the multibyte string `s` and its length in bytes, decode it with
   parse_UTF8_bitstream(), then print the wide string, its length in wide
   characters, and every wide character on its own line.
   Terminates the process if the parser reports failure. */
{
    printf("\n%s\n", s);
    printf("LENGTH: %zu 1-Byte Characters\n\n", strlen( s )); /* %zu is the size_t conversion. */

    wchar_t* outputstream = NULL;

    /* The parser returns an int and uses -1 for failure, so keep the result
       signed instead of storing it in a size_t and comparing that to -1. */
    int len = parse_UTF8_bitstream( &outputstream, s );

    if( len < 0 ) {
        printf("\nThe parser received invalid unicode.\n");
        free ( outputstream );
        exit ( EXIT_FAILURE );
    }

    printf("%ls\n", outputstream);
    printf("LENGTH: %d 4-Byte Wide-Characters\n", len);

    for(int i = 0; i<len; i++){
        printf("%lc\n", outputstream[ i ]);
    }

    free ( outputstream );
}


int main ( void )
/*  Run process_string() on "Hello World" written in Arabic, Russian, Greek,
    Georgian, Japanese, Chinese, and Korean, followed by an emoji string.

    The emoji string is there as an illustration [each emoji appears to be
    larger than 4 bytes, probably several unicode scalar values combined
    into one glyph].

    Asian and emoji characters only display with appropriate fonts. The
    noto-2.0 meta pkg on FreeBSD installs Asian and emoji fonts, and it is
    also available on Linux.

    Don't use this code to violate anyone.
*/
{
    const char *samples[] = {
        "مرحبا بالعالم",
        "Всем привет",
        "Γεια σου κόσμε",
        "გამარჯობა სამყაროვ",
        "ハローワールド",
        "世界您好",
        "전 세계 여러분 안녕하세요",
        "️️️️"
    };
    size_t count = sizeof samples / sizeof samples[ 0 ];
    size_t k;

    setlocale(LC_ALL, "");

    for ( k = 0; k < count; k++ ){
        process_string( samples[ k ] );
    }

    return 0;
}

Using UTF-8 Datastreams in cURL, when is the datastream converted from UTF-8 to ASCII?

3 Answers

Linked