When using cURL to read character streams from the internet, at what point is the data stream converted from a multibyte data type to a single-byte character array?
I wrote a program here, which appears to work using ASCII in the callback function.
However, I wrote another program that uses UTF-8 with wchar_t datatypes, which also appears to work. The datastream does not appear to differentiate between the two datatypes, even though a wchar_t type is 4 bytes on my machine and a char is 1 byte.
I guess that some sort of type conversion is happening transparently to this program, but I am not aware of it. (I think that in UTF-8, ASCII characters still take 1 byte of memory, but when a program uses wchar_t datatypes the system pads regular ASCII characters with zeros, converting them to 4 bytes; however, this was not something the programmer implemented...)
#include "multicurl.h"
/* Upper bound, in milliseconds, for one curl_multi_wait() call (5 seconds).
   Parenthesized so the macro behaves as a single value inside any expression. */
#define MAX_WAIT_MSECS (5 * 1000)
/*
 * cURL write callback: append the chunk cURL just received to the growing
 * buffer held in the MemType that was registered with CURLOPT_WRITEDATA.
 *
 * cURL hands over the response body as raw bytes; `size * nmemb` is a byte
 * count, not a count of wchar_t units, and it need not be a multiple of
 * sizeof(wchar_t). All offsets are therefore computed in bytes. Indexing the
 * buffer by `mem->size / sizeof(wchar_t)` would round down and make the next
 * chunk overwrite the tail of the previous one.
 *
 * ptr      - start of the received chunk (not NUL-terminated).
 * size     - size of one element in bytes.
 * nmemb    - number of elements in the chunk.
 * userdata - the MemType * accumulating the response; mem->size counts the
 *            payload bytes stored so far, excluding the terminator.
 *
 * Returns the number of bytes consumed; cURL treats any other value as a
 * write error. On allocation failure the program is terminated.
 */
static size_t write_callback(wchar_t *ptr, size_t size, size_t nmemb, void *userdata){
    MemType *mem = (MemType *)userdata;
    size_t realsize = nmemb * size; /* Bytes in this chunk. */
    const size_t limit = (size_t)-1 - sizeof(wchar_t);
    wchar_t *tmp = NULL;

    /* Only attempt the allocation when old size + chunk + terminator fits in a size_t. */
    if (mem->size <= limit && realsize <= limit - mem->size) {
        tmp = realloc(mem->memory, mem->size + realsize + sizeof(wchar_t));
    }
    if (tmp == NULL) {
        fprintf(stderr, "Not Enough Memory, realloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    mem->memory = tmp;

    /* Work through a byte pointer so the write position is exactly mem->size. */
    unsigned char *bytes = (unsigned char *)mem->memory;
    if (realsize != 0) {
        memcpy(bytes + mem->size, ptr, realsize); /* Overwrites the previous terminator. */
    }
    mem->size += realsize;

    /* Zero one full wchar_t after the payload so the buffer is terminated
       whether it is later read as char or as wchar_t. memset is used because
       this position is not necessarily aligned for a wchar_t store. */
    memset(bytes + mem->size, 0, sizeof(wchar_t));

    return realsize; /* cURL cross-checks this against the chunk size. */
}
/*
 * Create an easy handle for one URL, point its write callback at `output`,
 * and attach it to the multi handle `mh`.
 *
 * mh       - multi handle the new easy handle is added to.
 * utf8_url - wide-character URL; it is converted to a multibyte string with
 *            wcstombs(), which uses the current locale's LC_CTYPE setting.
 * output   - receives the response; on return output->memory is an empty,
 *            terminated buffer and output->size is 0. The caller owns (and
 *            must eventually free) output->memory.
 *
 * Always returns NULL; results are delivered through `output`. If the URL
 * cannot be converted, or the easy handle cannot be created or attached, a
 * message is printed where applicable and `output` is left empty. The
 * program is terminated on allocation failure, matching write_callback.
 */
void *SetUpCurlHandle(CURLM * mh, wchar_t *utf8_url, MemType *output){
    /* Give the output a valid, terminated, empty buffer on every path. */
    output->size = 0;
    output->memory = malloc(sizeof(wchar_t));
    if (output->memory == NULL) {
        fprintf(stderr, "Not Enough Memory, malloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    output->memory[0] = L'\0';

    /* One wide character can expand to at most MB_CUR_MAX bytes, so this
       capacity always has room for the converted text plus its terminator. */
    size_t wide_len = wcslen(utf8_url);
    if (wide_len > ((size_t)-1 - 1) / MB_CUR_MAX) {
        fprintf(stderr, "error: URL is too long to convert\n");
        return NULL;
    }
    size_t url_cap = wide_len * MB_CUR_MAX + 1;
    char *url = malloc(url_cap);
    if (url == NULL) {
        fprintf(stderr, "Not Enough Memory, malloc returned NULL.\n");
        exit(EXIT_FAILURE);
    }
    if (wcstombs(url, utf8_url, url_cap) == (size_t)-1) {
        fprintf(stderr, "error: URL contains a character that cannot be converted in the current locale\n");
        free(url);
        return NULL;
    }

    CURL *hnd = curl_easy_init();
    if (hnd) {
        // Setup the cURL options.
        curl_easy_setopt(hnd, CURLOPT_BUFFERSIZE, 102400L);
        curl_easy_setopt(hnd, CURLOPT_URL, url);// Set the request URL (libcurl keeps its own copy).
        curl_easy_setopt(hnd, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(hnd, CURLOPT_USERAGENT, "curl/7.80.0");
        curl_easy_setopt(hnd, CURLOPT_MAXREDIRS, 50L);
        curl_easy_setopt(hnd, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2TLS);
        curl_easy_setopt(hnd, CURLOPT_FTP_SKIP_PASV_IP, 1L);
        curl_easy_setopt(hnd, CURLOPT_TCP_KEEPALIVE, 1L);
        curl_easy_setopt(hnd, CURLOPT_WRITEFUNCTION, write_callback);// Callback that stores the received data.
        curl_easy_setopt(hnd, CURLOPT_WRITEDATA, (void *)output);// Passed to the callback as `userdata`.
        //curl_easy_setopt(hnd, CURLOPT_VERBOSE, 1);

        CURLMcode add_rc = curl_multi_add_handle(mh, hnd);
        if (add_rc != CURLM_OK) {
            /* The multi handle did not take the easy handle, so release it here. */
            fprintf(stderr, "error: curl_multi_add_handle() returned %d\n", (int)add_rc);
            curl_easy_cleanup(hnd);
        }
    }

    free(url); /* Safe: CURLOPT_URL copied the string. */
    return NULL;// The output struct was passed by reference, nothing to return.
}
CURLM *SetUpMultiCurlHandle(){
curl_global_init(CURL_GLOBAL_ALL);
CURLM * mh = curl_multi_init();
return mh;
}
/*
 * Drive every transfer attached to `mh` until none is still running, then
 * release the finished easy handles, the multi handle and libcurl's global
 * state.
 *
 * The transfers progress concurrently inside this single call:
 * curl_multi_perform() advances all of them, and curl_multi_wait() blocks
 * (for at most MAX_WAIT_MSECS) until there is activity or the timeout expires.
 *
 * mh - a multi handle with its easy handles already added. It is destroyed
 *      before this function returns, on the error paths as well, so it must
 *      not be used afterwards.
 *
 * Always returns NULL. Failures are reported on stderr.
 * NOTE: if the transfer loop stops early because of an error, easy handles
 * whose transfers never completed are not reported by curl_multi_info_read()
 * and are therefore not released here.
 */
void *PerformMultiCurl(CURLM * mh)
{
    CURLMsg *msg = NULL;
    int still_running = 0;
    int msgs_left = 0;

    CURLMcode mc = curl_multi_perform(mh, &still_running);// Start the requests.
    if (mc != CURLM_OK) {
        fprintf(stderr, "error: curl_multi_perform() returned %d\n", (int)mc);
    }

    /* Keep going until every transfer has finished or libcurl reports an error.
       Without this loop the function would fall through before the responses
       have been received. */
    while (mc == CURLM_OK && still_running) {
        int numfds = 0;
        mc = curl_multi_wait(mh, NULL, 0, MAX_WAIT_MSECS, &numfds);
        if (mc != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_wait() returned %d\n", (int)mc);
            break;
        }
        mc = curl_multi_perform(mh, &still_running);
        if (mc != CURLM_OK) {
            fprintf(stderr, "error: curl_multi_perform() returned %d\n", (int)mc);
        }
    }

    /* Release every finished easy handle, whether its transfer succeeded or not. */
    while ((msg = curl_multi_info_read(mh, &msgs_left))) {
        if (msg->msg != CURLMSG_DONE) {
            fprintf(stderr, "error: after curl_multi_info_read(), CURLMsg=%d\n", (int)msg->msg);
            continue;
        }
        /* Copy what is needed first: `msg` is no longer valid once the handle is removed. */
        CURL *hnd = msg->easy_handle;
        CURLcode return_code = msg->data.result;
        if (return_code != CURLE_OK) {
            fprintf(stderr, "CURL error code: %d\n", (int)return_code);
        }
        curl_multi_remove_handle(mh, hnd);
        curl_easy_cleanup(hnd);
    }

    curl_multi_cleanup(mh);
    curl_global_cleanup();
    return NULL;
}
The full UTF-8 variation of this program can be found here