I have a *.bz2 archive that contains a *.csv text file.
I need to process it: unpack it and change its encoding.
Right now I have code that unpacks this archive and then changes the encoding of the *.csv file by copying the original file into a second one. But I have run into a problem: the *.csv file is huge (5 GB), and the server has only 8 GB. That is, if I change the file's encoding by copying it to another file, I will not have enough HDD space for it.
So, I need to change the encoding of the file on the fly.
Currently, I have two pieces of code:
Unpacking:
// Decompresses the bzip2 archive at PathToBZ2 and writes the raw decompressed
// bytes to PathToCSV.
//
// PathToBZ2  path of the compressed input file (opened in binary mode).
// PathToCSV  path of the output file (created/truncated, binary mode).
//
// Returns true when the copy finished, false when either file could not be
// opened, closing the output reported a failure, or the decompressor threw a
// bzip2_error (a diagnostic is printed to cout for the known error codes).
// Exceptions other than bzip2_error are not handled here and propagate.
bool NetIO::unBZ2(string PathToBZ2, string PathToCSV) {
    try {
        ifstream file(PathToBZ2, ios_base::in | ios_base::binary);
        if (!file.is_open()) {
            cout << "cannot open compressed input file";
            return false;
        }

        // Write to the caller-supplied destination instead of a fixed debug path.
        ofstream outFile(PathToCSV, std::ofstream::out | std::ofstream::binary);
        if (!outFile.is_open()) {
            cout << "cannot open output file";
            return false;
        }

        // Filter chain: bytes read from `file` pass through the bzip2
        // decompressor before they reach the copy below.
        filtering_streambuf<input> decompressed;
        decompressed.push(bzip2_decompressor());
        decompressed.push(file);

        boost::iostreams::copy(decompressed, outFile);

        file.close();
        outFile.close();
        if (outFile.fail()) {
            return false;
        }
    }
    catch (const bzip2_error& exception) {
        const int error = exception.error();
        if (error == boost::iostreams::bzip2::data_error) {
            // compressed data stream is corrupted
            cout << "compressed data stream is corrupted";
        }
        else if (error == boost::iostreams::bzip2::data_error_magic) {
            // compressed data stream does not begin with the 'magic' sequence 'B' 'Z' 'h'
            cout << "compressed data stream does not begin with the 'magic' sequence 'B' 'Z' 'h'";
        }
        else if (error == boost::iostreams::bzip2::config_error) {
            // libbzip2 has been improperly configured for the current platform
            cout << "libbzip2 has been improperly configured for the current platform";
        }
        return false;
    }
    return true;
}
and the code for changing the encoding of the file:
bool NetIO::replaceLineBreaks(const string& inFilePath, const string& searchStr, const string& replacement) {
try {
boost::locale::generator gen;
ifstream iss(inFilePath.c_str(), std::ios::in | std::ios::binary);
std::locale lru = gen("ru_RU.CP1251");
std::locale lru2 = gen("ru_RU.UTF-8");
iss.imbue(lru2);
//string tempFilePath2 = inFilePath + ".tmp3";
string tempFilePath2 = "d:/temp/qqq.tmp3";
ofstream os(tempFilePath2, std::ios::out | std::ios::binary);
os.imbue(lru);
const int buffSize= 500000;
char qqq[buffSize];
//wchar_t wqqq[buffSize];
while (iss) {
iss.read(qqq, buffSize);
// MultiByteToWideChar(CP_UTF8, 0, qqq, buffSize, wqqq, buffSize);
os << qqq;
//cnt++;
/*if (cnt > 80) {
break;
}*/
}
os.close();
/*boost::iostreams::mapped_file istm(inFilePath.c_str());
if (!istm.is_open())
return false;
string tempFilePath = inFilePath + ".tmp";
ofstream ostm(tempFilePath.c_str(), std::ios::out | std::ios::binary);
if (!ostm.is_open()) {
istm.close();
return false;
}
boost::regex regexp(searchStr);
ostreambuf_iterator<char> it(ostm);
boost::regex_replace(it, istm.begin(), istm.end(), regexp, replacement, boost::match_default | boost::format_all);
istm.close();
ostm.close();
boost::filesystem::rename(tempFilePath, inFilePath);*/
}
catch (exception ex) {
cout << "Problem in line endings replacer" << std::endl;
cout << ex.what() << std::endl;
cout << "-----------------" << std::endl;
return false;
}
return true;
}
I am trying to understand how I can combine these two pieces of code.
As I understand it, a possible solution would be to create a custom ofstream object that receives the data and converts its encoding, and to pass the data to it with boost::iostreams::copy.
How can I combine these two pieces of code?
Thank you very much!