This is not an answer, but a third, quite complex example, on how to use wide character I/O. This was too long to add to my actual answer to this question.
This example shows how to read and process CSV files (RFC-4180 format, optionally with limited backslash escape support) using wide strings.
The following code is CC0/public domain, so you are free to use it any way you like, even include in your own proprietary projects, but if it breaks anything, you get to keep all the bits and not complain to me. (I'll be happy to include any bug fixes if you find and report them in a comment below, though.)
The logic of the code is robust, however. In particular, it supports universal newlines, all four common newline types: Unix-like LF (\n
), old CR LF (\r\n
), old Mac CR (\r
), and the occasionally encountered weird LF CR (\n\r
). There are no built-in limitations wrt. the length of a field, the number of fields in a record, or the number of records in a file. It works very nicely if you need to convert CSV or process CSV input stream-like (field by field or record-by-record), without having to have more than one in memory at one point. If you want to construct structures to describe the records and fields in memory, you'll need to add some scaffolding code for that.
Because of universal newline support, when reading input interactively, this program might require two consecutive end-of-inputs (Ctrl+Z in Windows and MS-DOS, Ctrl+D everywhere else), as the first one is usually "consumed" by the csv_next_field()
or csv_skip_field()
function, and the csv_next_record()
function needs to read it again to actually detect it. However, you do not normally ask the user to input CSV data interactively, so this should be an acceptable quirk.
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>
/* RFC 4180 format CSV file processing using wide input streams.
*
* #define BACKSLASH_ESCAPES if you additionally wish to have
* \\, \a, \b, \t, \n, \v, \f, \r, \", and \, de-escaped to their
* C string equivalents when reading CSV fields.
*/
/* Status codes returned by the csv_*() functions.
 * CSV_OK is zero, CSV_END is positive, and every error code is negative,
 * so callers can detect any error with (status < 0). */
typedef enum {
CSV_OK = 0, /* The operation succeeded. */
CSV_END = 1, /* End of the current record, or end of input, was reached. */
CSV_INVALID_PARAMETERS = -1, /* A required pointer argument was NULL (errno is set to EINVAL). */
CSV_FORMAT_ERROR = -2, /* Unterminated quoted field, or unexpected character after a closing quote. */
CSV_CHARSET_ERROR = -3, /* fgetwc() failed with errno == EILSEQ. */
CSV_READ_ERROR = -4, /* fgetwc() failed with another errno, or the stream error indicator is set. */
CSV_OUT_OF_MEMORY = -5, /* realloc() could not grow the field buffer (errno is set to ENOMEM). */
} csv_status;
const char *csv_error(const csv_status code)
{
switch (code) {
case CSV_OK: return "No error";
case CSV_END: return "At end";
case CSV_INVALID_PARAMETERS: return "Invalid parameters";
case CSV_FORMAT_ERROR: return "Bad CSV format";
case CSV_CHARSET_ERROR: return "Illegal character in CSV file (incorrect locale?)";
case CSV_READ_ERROR: return "Read error";
case CSV_OUT_OF_MEMORY: return "Out of memory";
default: return "Unknown csv_status code";
}
}
/* Start the next record. Automatically skips any remaining fields in the current record,
 * then consumes the single newline sequence that ends it (LF, CR, CR LF, or LF CR).
 * Returns CSV_OK if a newline was consumed, CSV_END if end of input was reached before
 * or immediately after that newline (no more records), or a negative CSV_ error code. */
csv_status csv_next_record (FILE *const in);
/* Skip the next field without storing its contents.
 * Returns CSV_OK if the skipped field was terminated by a comma (so another field follows),
 * CSV_END if the end of the current record or the end of input was reached -- note that
 * this includes the case where the last field of the record was just skipped --
 * or a negative CSV_ error code.
 * A NULL stream yields CSV_INVALID_PARAMETERS with errno set to EINVAL. */
csv_status csv_skip_field (FILE *const in);
/* Read the next field. Returns CSV_OK if successful, CSV_END if no more fields in current record,
 * or a negative CSV_ error code.
 * On entry, *dataptr must be either NULL or a buffer that may be passed to realloc(), with
 * *sizeptr its size in wide characters; if *dataptr is NULL, *sizeptr is reset to zero.
 * The buffer is grown with realloc() as needed, so the same pair can be reused across calls.
 * If this returns CSV_OK, then *dataptr is a dynamically allocated wide string to the field
 * contents, space allocated for *sizeptr wide characters; and if lengthptr is not NULL, then
 * *lengthptr is the number of wide characters in said wide string.
 * If lengthptr is not NULL, *lengthptr is set to zero before anything else is done.
 * The caller owns the buffer and releases it with free(). */
csv_status csv_next_field (FILE *const in, wchar_t **const dataptr,
size_t *const sizeptr,
size_t *const lengthptr);
/* Classify a WEOF that fgetwc(in) just returned, assuming errno was set to
 * zero immediately before that call.
 * Returns CSV_CHARSET_ERROR or CSV_READ_ERROR for a failure, or CSV_OK when
 * the stream simply ended; the caller decides what a clean end means. */
static csv_status internal_quoted_weof(FILE *const in)
{
    if (errno == EILSEQ)
        return CSV_CHARSET_ERROR;
    if (errno)
        return CSV_READ_ERROR;
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }
    return CSV_OK;
}

/* Skip the rest of a quoted field; the opening double quote has already
 * been consumed by the caller.
 *
 * Returns CSV_OK if the field was terminated by a comma, CSV_END if it was
 * followed by end of input or by a newline (the newline is pushed back so
 * that csv_next_record() sees it), CSV_FORMAT_ERROR if the input ends inside
 * the quotes or the closing quote is followed by anything other than
 * whitespace, a comma or a newline, or a negative read/charset error.
 *
 * With BACKSLASH_ESCAPES defined, a backslash makes the following character
 * part of the field, whatever it is. */
static csv_status internal_skip_quoted(FILE *const in)
{
    csv_status status;
    wint_t     wc;

    for (;;) {
        errno = 0;
        wc = fgetwc(in);
        if (wc == WEOF) {
            /* End of input before the closing quote. */
            status = internal_quoted_weof(in);
            return (status < 0) ? status : CSV_FORMAT_ERROR;
        }

#ifdef BACKSLASH_ESCAPES
        if (wc == L'\\') {
            errno = 0;
            wc = fgetwc(in);
            if (wc == WEOF) {
                /* Still inside the quotes: an unterminated field is a
                 * format error, exactly as for the plain case above. */
                status = internal_quoted_weof(in);
                return (status < 0) ? status : CSV_FORMAT_ERROR;
            }
            continue;
        }
#endif

        if (wc != L'"')
            continue;

        /* A double quote: either doubled (escaped), or the closing quote. */
        errno = 0;
        wc = fgetwc(in);
        if (wc == L'"')
            continue;

        /* Closing quote. Allow trailing whitespace before the separator. */
        while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc)) {
            errno = 0;
            wc = fgetwc(in);
        }

        if (wc == WEOF) {
            status = internal_quoted_weof(in);
            return (status < 0) ? status : CSV_END;
        }

        if (wc == L',') {
            errno = 0;
            return CSV_OK;
        }

        /* Newline ends the record; anything else is garbage after the
         * closing quote. Either way the character is left in the stream. */
        ungetwc(wc, in);
        errno = 0;
        return (wc == L'\n' || wc == L'\r') ? CSV_END : CSV_FORMAT_ERROR;
    }
}
/* Skip the rest of an unquoted field. wc is the first character of the
 * field, already read by the caller (it may be WEOF, a comma or a newline).
 * The caller must have set errno to zero before the fgetwc() call that
 * produced wc, because a WEOF is classified using errno.
 *
 * Returns CSV_OK if the field was terminated by a comma, CSV_END if the end
 * of the record (newline, pushed back into the stream) or the end of input
 * was reached, or a negative read/charset error.
 *
 * With BACKSLASH_ESCAPES defined, a backslash makes the following character
 * part of the field -- except for a real LF or CR, which still ends the
 * record, matching how csv_next_field() reads the same input. */
static csv_status internal_skip_unquoted(FILE *const in, wint_t wc)
{
    for (;;) {
        if (wc == WEOF) {
            if (errno == EILSEQ)
                return CSV_CHARSET_ERROR;
            if (errno)
                return CSV_READ_ERROR;
            if (ferror(in)) {
                errno = EIO;
                return CSV_READ_ERROR;
            }
            return CSV_END;
        }

        if (wc == L',') {
            errno = 0;
            return CSV_OK;
        }

        if (wc == L'\n' || wc == L'\r') {
            ungetwc(wc, in);
            errno = 0;
            return CSV_END;
        }

#ifdef BACKSLASH_ESCAPES
        if (wc == L'\\') {
            errno = 0;
            wc = fgetwc(in);
            /* End of input and real newlines are handled at the top of the
             * loop; a newline must never be swallowed as an escaped
             * character, or the record boundary would be lost. */
            if (wc == WEOF || wc == L'\n' || wc == L'\r')
                continue;
        }
#endif

        errno = 0;
        wc = fgetwc(in);
    }
}
csv_status csv_next_record(FILE *const in)
{
while (1) {
wint_t wc;
csv_status status;
do {
errno = 0;
wc = fgetwc(in);
} while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc));
if (wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
errno = 0;
return CSV_END;
}
if (wc == L'\n' || wc == L'\r') {
wint_t next_wc;
errno = 0;
next_wc = fgetwc(in);
if (next_wc == WEOF) {
if (errno == EILSEQ)
return CSV_CHARSET_ERROR;
if (errno)
return CSV_READ_ERROR;
if (ferror(in)) {
errno = EIO;
return CSV_READ_ERROR;
}
errno = 0;
return CSV_END;
}
if ((wc == L'\n' && next_wc == L'\r') ||
(wc == L'\r' && next_wc == L'\n')) {
errno = 0;
return CSV_OK;
}
ungetwc(next_wc, in);
errno = 0;
return CSV_OK;
}
if (wc == L'"')
status = internal_skip_quoted(in);
else
status = internal_skip_unquoted(in, wc);
if (status < 0)
return status;
}
}
/* Skip the next field of the current record without storing it.
 *
 * Leading whitespace (other than LF and CR) is discarded first; the rest of
 * the work is delegated to the quoted or unquoted skipper depending on the
 * first significant character. The return value is whatever that helper
 * returns, or CSV_INVALID_PARAMETERS / CSV_READ_ERROR if the stream is NULL
 * or already in an error state. */
csv_status csv_skip_field(FILE *const in)
{
    wint_t first;

    if (in == NULL) {
        errno = EINVAL;
        return CSV_INVALID_PARAMETERS;
    }
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }

    /* Find the first character that is not skippable whitespace. errno is
     * cleared before every read so the helper can classify a WEOF. */
    for (;;) {
        errno = 0;
        first = fgetwc(in);
        if (first == WEOF || first == L'\n' || first == L'\r' || !iswspace(first))
            break;
    }

    return (first == L'"') ? internal_skip_quoted(in)
                           : internal_skip_unquoted(in, first);
}
/* Classify a WEOF that fgetwc(in) just returned, assuming errno was set to
 * zero immediately before that call.
 * Returns CSV_CHARSET_ERROR or CSV_READ_ERROR for a failure, or CSV_OK when
 * the stream simply ended; the caller decides what a clean end means. */
static csv_status internal_field_weof(FILE *const in)
{
    if (errno == EILSEQ)
        return CSV_CHARSET_ERROR;
    if (errno)
        return CSV_READ_ERROR;
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }
    return CSV_OK;
}

/* Append wc to the field buffer (*dataptr, *sizeptr), which currently holds
 * *usedptr characters. The buffer is grown whenever fewer than two free
 * slots remain, so there is always room for the end-of-string mark later.
 * Returns CSV_OK, or CSV_OUT_OF_MEMORY (errno set to ENOMEM) in which case
 * the caller's buffer is left untouched and still valid. */
static csv_status internal_field_append(wchar_t **const dataptr,
                                        size_t *const sizeptr,
                                        size_t *const usedptr,
                                        const wint_t wc)
{
    if (*usedptr + 2 > *sizeptr) {
        /* Allocation policy.
         * Anything that yields size >= used + 2 is acceptable.
         * This one allocates in roughly 1024 byte chunks,
         * and is known to be robust (but not optimal) in practice. */
        const size_t   newsize = (*usedptr | 1023) + 1009;
        wchar_t *const newdata = realloc(*dataptr, newsize * sizeof **dataptr);

        if (!newdata) {
            errno = ENOMEM;
            return CSV_OUT_OF_MEMORY;
        }
        *dataptr = newdata;
        *sizeptr = newsize;
    }

    (*dataptr)[(*usedptr)++] = (wchar_t)wc;
    return CSV_OK;
}

#ifdef BACKSLASH_ESCAPES
/* Decode the character wc that followed a backslash. Recognized escapes
 * yield the character they stand for. For anything else, wc is pushed back
 * into the stream and the backslash itself is returned, so both characters
 * are kept as ordinary data. */
static wint_t internal_field_unescape(FILE *const in, const wint_t wc)
{
    switch (wc) {
    case L'a':  return L'\a';
    case L'b':  return L'\b';
    case L't':  return L'\t';
    case L'n':  return L'\n';
    case L'v':  return L'\v';
    case L'f':  return L'\f';
    case L'r':  return L'\r';
    case L'\\':
    case L'"':
    case L',':
        return wc;
    default:
        ungetwc(wc, in);
        return L'\\';
    }
}
#endif

/* Read the next field of the current record into a dynamically allocated
 * wide string.
 *
 * in         Wide-oriented input stream.
 * dataptr    *dataptr is NULL, or a buffer that may be passed to realloc();
 *            it is grown as needed and updated in place.
 * sizeptr    *sizeptr is the size of *dataptr in wide characters; it is
 *            reset to zero when *dataptr is NULL.
 * lengthptr  Optional. Set to zero on entry, and to the field length (in
 *            wide characters, excluding the terminator) on success.
 *
 * Leading whitespace is skipped. A field that starts with a double quote
 * runs to the matching closing quote (a doubled quote stands for one quote
 * character, and newlines inside the quotes are data); any other field runs
 * to the next comma, newline, or end of input. The newline that ends a
 * record is left in the stream for csv_next_record().
 *
 * Returns CSV_OK with a terminated string in *dataptr; CSV_END if there are
 * no more fields in the current record (the buffer is not modified);
 * CSV_FORMAT_ERROR if the input ends inside a quoted field or a closing
 * quote is followed by unexpected data; or another negative CSV_ error
 * code. After an error the buffer contents are unspecified, but *dataptr
 * remains valid and must still be freed by the caller. */
csv_status csv_next_field(FILE *const in, wchar_t **const dataptr,
                          size_t *const sizeptr,
                          size_t *const lengthptr)
{
    size_t     used = 0; /* length */
    csv_status status;
    wint_t     wc;

    if (lengthptr)
        *lengthptr = 0;

    if (!in || !dataptr || !sizeptr) {
        errno = EINVAL;
        return CSV_INVALID_PARAMETERS;
    }
    if (ferror(in)) {
        errno = EIO;
        return CSV_READ_ERROR;
    }

    /* Without a buffer, whatever size the caller passed is meaningless. */
    if (!*dataptr)
        *sizeptr = 0;

    /* Skip leading whitespace. */
    do {
        errno = 0;
        wc = fgetwc(in);
    } while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc));

    if (wc == WEOF) {
        status = internal_field_weof(in);
        return (status < 0) ? status : CSV_END;
    }

    if (wc == L'\n' || wc == L'\r') {
        ungetwc(wc, in);
        errno = 0;
        return CSV_END;
    }

    if (wc == L'"') {
        /* Quoted field. */
        for (;;) {
            errno = 0;
            wc = fgetwc(in);
            if (wc == WEOF) {
                /* End of input before the closing quote. */
                status = internal_field_weof(in);
                return (status < 0) ? status : CSV_FORMAT_ERROR;
            }

            if (wc == L'"') {
                errno = 0;
                wc = fgetwc(in);
                if (wc != L'"') {
                    /* Not an escaped doublequote, so the field is complete.
                     * Allow trailing whitespace before the separator. */
                    while (wc != WEOF && wc != L'\n' && wc != L'\r' && iswspace(wc)) {
                        errno = 0;
                        wc = fgetwc(in);
                    }
                    if (wc == WEOF) {
                        status = internal_field_weof(in);
                        if (status < 0)
                            return status;
                    } else if (wc == L'\n' || wc == L'\r') {
                        ungetwc(wc, in);
                    } else if (wc != L',') {
                        errno = 0;
                        return CSV_FORMAT_ERROR;
                    }
                    break;
                }
                /* Doubled quote: store a single quote character below. */
            }
#ifdef BACKSLASH_ESCAPES
            else if (wc == L'\\') {
                errno = 0;
                wc = fgetwc(in);
                if (wc == WEOF) {
                    /* Still inside the quotes: the field is unterminated,
                     * which is a format error just like above. */
                    status = internal_field_weof(in);
                    return (status < 0) ? status : CSV_FORMAT_ERROR;
                }
                /* NOTE(review): a backslash followed by a NUL character is
                 * dropped entirely. Kept from the original code; the
                 * unquoted branch has no such rule -- confirm intent. */
                if (wc == L'\0')
                    continue;
                wc = internal_field_unescape(in, wc);
            }
#endif

            status = internal_field_append(dataptr, sizeptr, &used, wc);
            if (status < 0)
                return status;
        }
    } else {
        /* Unquoted field; wc is its first character. */
        for (;;) {
            if (wc == L',')
                break;
            if (wc == L'\n' || wc == L'\r') {
                ungetwc(wc, in);
                break;
            }

#ifdef BACKSLASH_ESCAPES
            if (wc == L'\\') {
                errno = 0;
                wc = fgetwc(in);
                if (wc == WEOF) {
                    status = internal_field_weof(in);
                    if (status < 0)
                        return status;
                    /* A lone backslash at end of input is ordinary data. */
                    wc = L'\\';
                } else {
                    wc = internal_field_unescape(in, wc);
                }
            }
#endif

            status = internal_field_append(dataptr, sizeptr, &used, wc);
            if (status < 0)
                return status;

            errno = 0;
            wc = fgetwc(in);
            if (wc == WEOF) {
                status = internal_field_weof(in);
                if (status < 0)
                    return status;
                break;
            }
        }
    }

    /* Ensure there is room for the end-of-string mark. Only an empty field
     * can get here without it, since appending always keeps a spare slot. */
    if (used >= *sizeptr) {
        wchar_t *const exact = realloc(*dataptr, (used + 1) * sizeof **dataptr);

        if (!exact) {
            errno = ENOMEM;
            return CSV_OUT_OF_MEMORY;
        }
        *dataptr = exact;
        *sizeptr = used + 1;
    }

    (*dataptr)[used] = L'\0';
    if (lengthptr)
        *lengthptr = used;

    errno = 0;
    return CSV_OK;
}
/* Helper function: print the first len wide characters of ws to out as they
 * would appear inside a quoted C string, backslash-escaping special
 * characters.
 *
 * NUL, \a, \b, \t, \n, \v, \f, \r, the double quote and the backslash are
 * written as their usual two-character escapes. Other printable characters
 * (per iswprint() in the current locale) are written as is. Everything else
 * is written as \xHHHH if the code fits in 16 bits, or \xHHHHHHHH otherwise.
 *
 * The surrounding quotes are not printed. Does nothing if out or ws is NULL. */
static void wquoted(FILE *const out, const wchar_t *ws, const size_t len)
{
    size_t i;

    if (!out || !ws)
        return;

    for (i = 0; i < len; i++) {
        const wchar_t wc = ws[i];

        switch (wc) {
        case L'\0': fputws(L"\\0", out);  break;
        case L'\a': fputws(L"\\a", out);  break;
        case L'\b': fputws(L"\\b", out);  break;
        case L'\t': fputws(L"\\t", out);  break;
        case L'\n': fputws(L"\\n", out);  break;
        case L'\v': fputws(L"\\v", out);  break;
        case L'\f': fputws(L"\\f", out);  break;
        case L'\r': fputws(L"\\r", out);  break;
        case L'"':  fputws(L"\\\"", out); break;
        case L'\\': fputws(L"\\\\", out); break;
        default:
            if (iswprint((wint_t)wc)) {
                fputwc(wc, out);
            } else {
                /* Limit to 32 bits so the output is at most eight digits
                 * even if wchar_t is a signed type. The conversion
                 * specifier must match the unsigned long argument. */
                const unsigned long code = (unsigned long)wc & 0xFFFFFFFFUL;

                if (code <= 0xFFFFUL)
                    fwprintf(out, L"\\x%04lx", code);
                else
                    fwprintf(out, L"\\x%08lx", code);
            }
            break;
        }
    }
}
/* Read every record of the CSV stream in, printing each field (escaped by
 * wquoted()) together with its record number, field number and length to
 * standard output.
 *
 * filename is only used as a prefix for error messages on standard error.
 * Returns 0 when the end of the input is reached, or -1 after reporting an
 * error. The field buffer is released in both cases. */
static int show_csv(FILE *const in, const char *const filename)
{
    wchar_t       *buffer = NULL;
    size_t         capacity = 0;
    size_t         length = 0;
    unsigned long  record_no;
    unsigned long  field_no;
    csv_status     status;

    for (record_no = 1UL; ; record_no++) {

        /* Print the fields of this record until the record ends. */
        field_no = 0UL;
        while ((status = csv_next_field(in, &buffer, &capacity, &length)) != CSV_END) {
            if (status < 0)
                goto failed;

            field_no++;
            wprintf(L"Record %lu, field %lu is \"", record_no, field_no);
            wquoted(stdout, buffer, length);
            wprintf(L"\", %lu characters.\n", (unsigned long)length);
        }

        /* Advance to the next record, if there is one. */
        status = csv_next_record(in);
        if (status == CSV_END)
            break;
        if (status < 0)
            goto failed;
    }

    free(buffer);
    return 0;

failed:
    fprintf(stderr, "%s: %s.\n", filename, csv_error(status));
    free(buffer);
    return -1;
}
/* Print a short usage summary to standard error.
 * argv0 is the program name shown in the summary.
 * Always returns EXIT_SUCCESS, so callers can write `return usage(...);`. */
static int usage(const char *argv0)
{
    FILE *const err = stderr;

    fputs("\n", err);
    fprintf(err, "Usage: %s [ -h | --help | /? ]\n", argv0);
    fprintf(err, " %s CSV-FILE [ ... ]\n", argv0);
    fputs("\n", err);
    fputs("Use special file name '-' to read from standard input.\n", err);
    fputs("\n", err);

    return EXIT_SUCCESS;
}
/* Example program: report every field of each CSV file named on the command
 * line, using '-' for standard input.
 *
 * Prints the usage summary when no file is named or when -h, --help or /?
 * is given. Returns EXIT_FAILURE as soon as a file cannot be opened, read,
 * parsed or closed, and EXIT_SUCCESS otherwise. */
int main(int argc, char *argv[])
{
    const char *argv0;
    FILE       *in;
    int         arg;
    int         failed;

    setlocale(LC_ALL, "");
    fwide(stdin, 1);
    fwide(stdout, 1);

    /* argv[0] may legally be NULL (argc == 0); never pass that to printf. */
    argv0 = (argc > 0 && argv[0]) ? argv[0] : "PROGRAM";

    /* With no file names there is nothing to do, so explain the usage. */
    if (argc < 2)
        return usage(argv0);

    for (arg = 1; arg < argc; arg++) {
        if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help") || !strcmp(argv[arg], "/?"))
            return usage(argv0);

        if (!strcmp(argv[arg], "-")) {
            if (show_csv(stdin, "(standard input)"))
                return EXIT_FAILURE;
            continue;
        }

        in = fopen(argv[arg], "r");
        if (!in) {
            fprintf(stderr, "%s: %s.\n", argv[arg], strerror(errno));
            return EXIT_FAILURE;
        }

        /* Use the wide-character I/O functions on this stream. */
        fwide(in, 1);

        /* show_csv() reports its own errors. */
        failed = show_csv(in, argv[arg]);

        if (!failed && ferror(in)) {
            fprintf(stderr, "%s: %s.\n", argv[arg], strerror(EIO));
            failed = 1;
        }

        /* Always close the file, even after a failure. */
        if (fclose(in) && !failed) {
            fprintf(stderr, "%s: %s.\n", argv[arg], strerror(EIO));
            failed = 1;
        }

        if (failed)
            return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
The use of the above csv_next_field()
, csv_skip_field()
, and csv_next_record()
is quite straightforward.
Open the CSV file normally, then call fwide(stream, 1)
on it to tell the C library you intend to use the wide string variants instead of the standard narrow string I/O functions.
Create four variables, and initialize the first two:
wchar_t *field = NULL;
size_t allocated = 0;
size_t length;
csv_status status;
field
is a pointer to the dynamically allocated contents of each field you read. It is allocated automatically; essentially, you don't need to worry about it at all. allocated
holds the currently allocated size (in wide characters, including terminating L'\0'
), and we'll use length
and status
later.
At this point, you are ready to read or skip the first field in the first record.
You do not wish to call csv_next_record()
at this point, unless you wish to skip the very first record entirely in the file.
Call status = csv_skip_field(stream);
to skip the next field, or status = csv_next_field(stream, &field, &allocated, &length);
to read it.
If status == CSV_OK
, you have the field contents in the wide string field
. It has length
wide characters in it.
If status == CSV_END
, there were no more fields in the current record. (The field
is unchanged, and you should not examine it.)
Otherwise, status < 0
, and it describes an error code. You can use csv_error(status)
to obtain a (narrow) string describing it.
At any point, you can move (skip) to the start of the next record by calling status = csv_next_record(stream);
.
If it returns CSV_OK
, there might be a new record available. (We only know when you try to read or skip the first field. This is similar to how standard C library function feof()
only tells you whether you have tried to read past the end of input, it does not tell whether there is more data available or not.)
If it returns CSV_END
, you already have processed the last record, and there are no more records.
Otherwise, it returns a negative error code, status < 0
. You can use csv_error(status)
to obtain a (narrow) string describing it.
After you are done, discard the field buffer:
free(field);
field = NULL;
allocated = 0;
You do not actually need to reset the variables to NULL
and zero, but I recommend it. In fact, you can do the above at any point (when you are no longer interested in the contents of the current field), as the csv_next_field()
will then automatically allocate a new buffer as necessary.
Note that free(NULL);
is always safe and does nothing. You do not need to check if field
is NULL
or not before freeing it. This is also the reason why I recommend initializing the variables immediately when you declare them. It just makes everything so much easier to handle.
The compiled example program takes one or more CSV file names as command-line parameters, then reads the files and reports the contents of each field in the file. If you have a particularly fiendishly complex CSV file, this is optimal for checking if this approach reads all the fields correctly.