I also managed to fix it by trailing/cutting down all Non-ASCII characters.
This one takes about 2.6
seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
int index;
int charsread;
int invalidcharsoffset;
unsigned int fixedchar;
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "index " << std::setw(3) << index
// << " readline " << std::setw(10) << fixedchar
// << " -> '" << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
fixedreadline[index-invalidcharsoffset] = readline[index];
}
else {
++invalidcharsoffset;
}
}
fixedreadline[index-invalidcharsoffset] = '\0';
// std::cerr << "fixedreadline=" << fixedreadline << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline=" << fixedreadline << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Alternative and slower version using memcpy
Using menmove
does not improve much speed, so you could either one.
This one takes about 3.1
seconds to parse 319MB:
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
char* finalresult;
int index;
int lastcopy;
int charsread;
int charstocopy;
int invalidcharsoffset;
bool hasignoredbytes;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
hasignoredbytes = false;
source = readline;
destination = fixedreadline;
lastcopy = 0;
invalidcharsoffset = 0;
for( index = 0; index < charsread; ++index )
{
fixedchar = static_cast<unsigned int>( readline[index] );
// std::cerr << "fixedchar " << std::setw(10)
// << fixedchar << " -> '"
// << readline[index] << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
if( hasignoredbytes ) {
charstocopy = index - lastcopy - invalidcharsoffset;
memcpy( destination, source, charstocopy );
source += index - lastcopy;
lastcopy = index;
destination += charstocopy;
invalidcharsoffset = 0;
hasignoredbytes = false;
}
}
else {
++invalidcharsoffset;
hasignoredbytes = true;
}
}
if( destination != fixedreadline ) {
charstocopy = charsread - static_cast<int>( source - readline )
- invalidcharsoffset;
memcpy( destination, source, charstocopy );
destination += charstocopy - 1;
if( *destination == '\n' ) {
*destination = '\0';
}
else {
*++destination = '\0';
}
finalresult = fixedreadline;
}
else {
finalresult = readline;
}
// std::cerr << "finalresult=" << finalresult << std::endl;
}
else {
break;
}
}
std::cerr << "finalresult=" << finalresult << std::endl;
free( readline );
free( fixedreadline );
fclose( cfilestream );
return 0;
}
Optimized solution using iconv
This takes about 4.6
seconds to parse 319MB of text.
#include <iconv.h>
#include <string.h>
#include <stdlib.h>
#include <iostream>
// Compile it with:
// g++ -o main test.cpp -O3 -liconv
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
char* fixedreadline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
if( fixedreadline == NULL ) {
perror( "malloc fixedreadline" );
return -1;
}
char* source;
char* destination;
int charsread;
size_t inchars;
size_t outchars;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
iconv_t conversiondescriptor = iconv_open("UTF-8//IGNORE", "UTF-8");
if( conversiondescriptor == (iconv_t)-1 ) {
perror( "iconv_open conversiondescriptor" );
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
source = readline;
inchars = charsread;
destination = fixedreadline;
outchars = charsread;
if( iconv( conversiondescriptor, &source, &inchars, &destination, &outchars ) )
{
perror( "iconv" );
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "fixedreadline='" << fixedreadline << "'" << std::endl;
free( readline );
free( fixedreadline );
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
if( iconv_close( conversiondescriptor ) ) {
perror( "iconv_close conversiondescriptor" );
}
return 0;
}
Slowest solution ever using mbtowc
This takes about 24.2
seconds to parse 319MB of text.
If you comment out the line fixedchar = mbtowc(NULL, source, charsread);
and uncomment the line charsread -= fixedchar;
(breaking the invalid characters removal) this will take 1.9
seconds instead of 24.2
seconds (also compiled with -O3
optimization level).
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; )
{
// fixedchar = 1;
fixedchar = mbtowc(NULL, source, charsread);
charsread -= fixedchar;
// std::ostringstream contents;
// for( int index = 0; index < fixedchar; ++index )
// contents << source[index];
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '"
// << contents.str().c_str() << "'" << std::endl;
if( fixedchar > 0 ) {
memmove( destination, source, fixedchar );
source += fixedchar;
destination += fixedchar;
}
else if( fixedchar < 0 ) {
source += 1;
// std::cerr << "errno=" << strerror( errno ) << std::endl;
}
else {
break;
}
}
// Trim out the new line character
if( *--destination == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Fastest version from all my others above using memmove
You cannot use memcpy
here because the memory regions overlap!
This takes about 2.4
seconds to parse 319MB.
If you comment out the lines *destination = *source
and memmove( destination, source, 1 )
(breaking the invalid characters removal) the performance still almost the same as when memmove
is being called. Here in, calling memmove( destination, source, 1 )
is a little slower than directly doing *destination = *source;
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <iomanip>
int main(int argc, char const *argv[])
{
FILE* cfilestream = fopen( "./test.txt", "r" );
size_t linebuffersize = 131072;
if( cfilestream == NULL ) {
perror( "fopen cfilestream" );
return -1;
}
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
char* source;
char* lineend;
char* destination;
int charsread;
unsigned int fixedchar;
if( ( source = std::setlocale( LC_ALL, "en_US.utf8" ) ) == NULL ) {
perror( "setlocale" );
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
while( true )
{
if( ( charsread = getline( &readline, &linebuffersize, cfilestream ) ) != -1 )
{
lineend = readline + charsread;
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "readline='" << readline << "'" << std::endl;
if( fclose( cfilestream ) ) {
perror( "fclose cfilestream" );
}
free( readline );
return 0;
}
Bonus
You can also use Python C Extensions (API).
It takes about 2.3
seconds to parse 319MB without converting them to cached version UTF-8 char*
And takes about 3.2
seconds to parse 319MB converting them to UTF-8
char*.
And also takes about 3.2
seconds to parse 319MB converting them to cached ASCII
char*.
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <iostream>
typedef struct
{
PyObject_HEAD
}
PyFastFile;
static PyModuleDef fastfilepackagemodule =
{
// https://docs.python.org/3/c-api/module.html#c.PyModuleDef
PyModuleDef_HEAD_INIT,
"fastfilepackage", /* name of module */
"Example module that wrapped a C++ object", /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module, or
-1 if the module keeps state in global variables. */
NULL, /* PyMethodDef* m_methods */
NULL, /* inquiry m_reload */
NULL, /* traverseproc m_traverse */
NULL, /* inquiry m_clear */
NULL, /* freefunc m_free */
};
// initialize PyFastFile Object
static int PyFastFile_init(PyFastFile* self, PyObject* args, PyObject* kwargs) {
char* filepath;
if( !PyArg_ParseTuple( args, "s", &filepath ) ) {
return -1;
}
int linecount = 0;
PyObject* iomodule;
PyObject* openfile;
PyObject* fileiterator;
iomodule = PyImport_ImportModule( "builtins" );
if( iomodule == NULL ) {
std::cerr << "ERROR: FastFile failed to import the io module '"
"(and open the file " << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openfunction = PyObject_GetAttrString( iomodule, "open" );
if( openfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module open "
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
openfile = PyObject_CallFunction(
openfunction, "ssiss", filepath, "r", -1, "ASCII", "ignore" );
if( openfile == NULL ) {
std::cerr << "ERROR: FastFile failed to open the file'"
<< filepath << "'!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* iterfunction = PyObject_GetAttrString( openfile, "__iter__" );
Py_DECREF( openfunction );
if( iterfunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator"
<< "function (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* openiteratorobject = PyObject_CallObject( iterfunction, NULL );
Py_DECREF( iterfunction );
if( openiteratorobject == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator object"
<< " (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
fileiterator = PyObject_GetAttrString( openfile, "__next__" );
Py_DECREF( openiteratorobject );
if( fileiterator == NULL ) {
std::cerr << "ERROR: FastFile failed get the io module iterator "
<< "object (and open the file '" << filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* readline;
while( ( readline = PyObject_CallObject( fileiterator, NULL ) ) != NULL ) {
linecount += 1;
PyUnicode_AsUTF8( readline );
Py_DECREF( readline );
// std::cerr << "linecount " << linecount << " readline '" << readline
// << "' '" << PyUnicode_AsUTF8( readline ) << "'" << std::endl;
}
std::cerr << "linecount " << linecount << std::endl;
// PyErr_PrintEx(100);
PyErr_Clear();
PyObject* closefunction = PyObject_GetAttrString( openfile, "close" );
if( closefunction == NULL ) {
std::cerr << "ERROR: FastFile failed get the close file function for '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
PyObject* closefileresult = PyObject_CallObject( closefunction, NULL );
Py_DECREF( closefunction );
if( closefileresult == NULL ) {
std::cerr << "ERROR: FastFile failed close open file '"
<< filepath << "')!" << std::endl;
PyErr_PrintEx(100);
return -1;
}
Py_DECREF( closefileresult );
Py_XDECREF( iomodule );
Py_XDECREF( openfile );
Py_XDECREF( fileiterator );
return 0;
}
// destruct the object
static void PyFastFile_dealloc(PyFastFile* self) {
Py_TYPE(self)->tp_free( (PyObject*) self );
}
static PyTypeObject PyFastFileType =
{
PyVarObject_HEAD_INIT( NULL, 0 )
"fastfilepackage.FastFile" /* tp_name */
};
// create the module
PyMODINIT_FUNC PyInit_fastfilepackage(void)
{
PyObject* thismodule;
// https://docs.python.org/3/c-api/typeobj.html
PyFastFileType.tp_new = PyType_GenericNew;
PyFastFileType.tp_basicsize = sizeof(PyFastFile);
PyFastFileType.tp_dealloc = (destructor) PyFastFile_dealloc;
PyFastFileType.tp_flags = Py_TPFLAGS_DEFAULT;
PyFastFileType.tp_doc = "FastFile objects";
PyFastFileType.tp_init = (initproc) PyFastFile_init;
if( PyType_Ready( &PyFastFileType) < 0 ) {
return NULL;
}
thismodule = PyModule_Create(&fastfilepackagemodule);
if( thismodule == NULL ) {
return NULL;
}
// Add FastFile class to thismodule allowing the use to create objects
Py_INCREF( &PyFastFileType );
PyModule_AddObject( thismodule, "FastFile", (PyObject*) &PyFastFileType );
return thismodule;
}
To built it, create the file source/fastfilewrappar.cpp
with the contents of the above file and the setup.py
with the following contents:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from setuptools import setup, Extension
myextension = Extension(
language = "c++",
extra_link_args = ["-std=c++11"],
extra_compile_args = ["-std=c++11"],
name = 'fastfilepackage',
sources = [
'source/fastfilewrapper.cpp'
],
include_dirs = [ 'source' ],
)
setup(
name = 'fastfilepackage',
ext_modules= [ myextension ],
)
To run example, use following Python
script:
import time
import datetime
import fastfilepackage
testfile = './test.txt'
timenow = time.time()
iterable = fastfilepackage.FastFile( testfile )
fastfile_time = time.time() - timenow
timedifference = datetime.timedelta( seconds=fastfile_time )
print( 'FastFile timedifference', timedifference, flush=True )
Example:
user@user-pc$ /usr/bin/pip3.6 install .
Processing /fastfilepackage
Building wheels for collected packages: fastfilepackage
Building wheel for fastfilepackage (setup.py) ... done
Stored in directory: /pip-ephem-wheel-cache-j313cpzc/wheels/e5/5f/bc/52c820
Successfully built fastfilepackage
Installing collected packages: fastfilepackage
Found existing installation: fastfilepackage 0.0.0
Uninstalling fastfilepackage-0.0.0:
Successfully uninstalled fastfilepackage-0.0.0
Successfully installed fastfilepackage-0.0.0
user@user-pc$ /usr/bin/python3.6 fastfileperformance.py
linecount 820800
FastFile timedifference 0:00:03.204614
Using std::getline
This takes about 4.7
seconds to parse 319MB.
If you remove the UTF-8
removal algorithm borrowed from the fastest benchmark using stdlib.h getline()
, it takes 1.7
seconds to run.
#include <stdlib.h>
#include <iostream>
#include <locale>
#include <fstream>
#include <iomanip>
int main(int argc, char const *argv[])
{
unsigned int fixedchar;
int linecount = -1;
char* source;
char* lineend;
char* destination;
if( ( source = setlocale( LC_ALL, "en_US.ascii" ) ) == NULL ) {
perror( "setlocale" );
return -1;
}
else {
std::cerr << "locale='" << source << "'" << std::endl;
}
std::ifstream fileifstream{ "./test.txt" };
if( fileifstream.fail() ) {
std::cerr << "ERROR: FastFile failed to open the file!" << std::endl;
return -1;
}
size_t linebuffersize = 131072;
char* readline = (char*) malloc( linebuffersize );
if( readline == NULL ) {
perror( "malloc readline" );
return -1;
}
while( true )
{
if( !fileifstream.eof() )
{
linecount += 1;
fileifstream.getline( readline, linebuffersize );
lineend = readline + fileifstream.gcount();
destination = readline;
for( source = readline; source != lineend; ++source )
{
fixedchar = static_cast<unsigned int>( *source );
// std::cerr << "fixedchar=" << std::setw(10)
// << fixedchar << " -> '" << *source << "'" << std::endl;
if( 31 < fixedchar && fixedchar < 128 ) {
*destination = *source;
++destination;
}
}
// Trim out the new line character
if( *source == '\n' ) {
*--destination = '\0';
}
else {
*destination = '\0';
}
// std::cerr << "readline='" << readline << "'" << std::endl;
}
else {
break;
}
}
std::cerr << "linecount='" << linecount << "'" << std::endl;
if( fileifstream.is_open() ) {
fileifstream.close();
}
free( readline );
return 0;
}
Resume
2.6
seconds trimming UTF-8 using two buffers with indexing
3.1
seconds trimming UTF-8 using two buffers with memcpy
4.6
seconds removing invalid UTF-8 with iconv
24.2
seconds removing invalid UTF-8 with mbtowc
2.4
seconds trimming UTF-8 using one buffer with pointer direct assigning
Bonus
2.3
seconds removing invalid UTF-8 without converting them to a cached UTF-8 char*
3.2
seconds removing invalid UTF-8 converting them to a cached UTF-8 char*
3.2
seconds trimming UTF-8 and caching as ASCII char*
4.7
seconds trimming UTF-8 with std::getline()
using one buffer with pointer direct assigning
The used file ./text.txt
had 820.800
lines where each line was equal to:
id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char&id-é-char\r\n
And all versions where compiled with
g++ (GCC) 7.4.0
iconv (GNU libiconv 1.14)
g++ -o main test.cpp -O3 -liconv && time ./main