2

I need a JavaScript function that converts cp125*2* text to Unicode (UTF).

I have already found a function that converts CP125*1* to UTF.

Please help me if you have this functionality, thanks!

Daniel
  • 21
  • 1
  • 3
  • Javascript strings are Unicode strings. Where are you getting that cp1252 encoded text from and in what format? a byte array? – Alex Jasmin Nov 09 '10 at 01:59
  • So, I get the raw page html just after it loaded (in my firefox addon), run the javascript function that parse this page by regexp. And to do it correctly I previously need to convert all characters to utf. I can do it correctly on the server side, but not in this task. – Daniel Nov 09 '10 at 02:06

1 Answer

8

If ISO-8859-1 is close enough, there is a special shortcut to convert ISO-8859-1-bytes-in-code-units to Unicode characters, due to the simple byte=code-point mapping:

var chars= decodeURIComponent(escape(bytes));

For any other encoding there is no built-in functionality; you would have to include your own lookup tables. For example:

// Lookup tables for single-byte encodings, keyed by encoding name.
// Each value is a string used as a positional table: the character at
// index b is the Unicode character for byte value b (decodeBytes reads it
// with enc.charAt(byteValue)). Indices 0x00-0x7f are plain ASCII in both
// tables; only the upper half differs between code pages.
// '\ufffd' (U+FFFD REPLACEMENT CHARACTER) marks positions that have no
// mapping -- presumably bytes the code page leaves unassigned.
var encodings= {
    // Windows code page 1252 Western European
    // 0x80-0x9f: typographic punctuation, the euro sign and a few extra
    // Latin letters; 0xa0-0xff map to the same code points (U+00A0-U+00FF).
    cp1252: '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u20ac\ufffd\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\ufffd\u017d\ufffd\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\ufffd\u017e\u0178\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff',

    // Windows code page 1251 Cyrillic
    // 0x80-0xbf: punctuation, symbols and additional Cyrillic letters;
    // 0xc0-0xff: the contiguous run U+0410-U+044F.
    cp1251: '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u0402\u0403\u201a\u0453\u201e\u2026\u2020\u2021\u20ac\u2030\u0409\u2039\u040a\u040c\u040b\u040f\u0452\u2018\u2019\u201c\u201d\u2022\u2013\u2014\ufffd\u2122\u0459\u203a\u045a\u045c\u045b\u045f\xa0\u040e\u045e\u0408\xa4\u0490\xa6\xa7\u0401\xa9\u0404\xab\xac\xad\xae\u0407\xb0\xb1\u0406\u0456\u0491\xb5\xb6\xb7\u0451\u2116\u0454\xbb\u0458\u0405\u0455\u0457\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b\u042c\u042d\u042e\u042f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f'
};

/**
 * Decode a "byte string" (a string in which every UTF-16 code unit holds
 * one byte value, 0x00-0xff) into Unicode text using one of the lookup
 * tables in `encodings`.
 *
 * @param {string} bytes - Input whose code units are the encoded bytes.
 * @param {string} encoding - Name of a table in `encodings`, e.g. 'cp1251'.
 * @returns {string} The decoded text. Code units that fall outside the
 *     table (above 0xff, i.e. not bytes at all) are passed through
 *     unchanged instead of being silently dropped.
 * @throws {TypeError} If `encoding` does not name a table in `encodings`.
 */
function decodeBytes(bytes, encoding) {
    // Own-property check so that inherited keys such as 'toString' are not
    // mistaken for encodings, and unknown names fail with a clear message.
    if (!Object.prototype.hasOwnProperty.call(encodings, encoding))
        throw new TypeError('Unknown encoding: ' + encoding);
    var enc= encodings[encoding];
    var n= bytes.length;
    var chars= new Array(n);
    for (var i= 0; i<n; i++) {
        var code= bytes.charCodeAt(i);
        // charAt() returns '' for an out-of-range index, which would lose
        // the character; keep the original code unit in that case.
        chars[i]= code<enc.length? enc.charAt(code) : bytes.charAt(i);
    }
    return chars.join('');
}

// Usage example: the cp1251-encoded bytes below decode to
// '\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439 \u043c\u0438\u0440',
// i.e. "Здравствуй мир" (Russian for "Hello world").
alert(
    decodeBytes(
        '\xc7\xe4\xf0\xe0\xe2\xf1\xf2\xe2\xf3\xe9 \xec\xe8\xf0',
        'cp1251'
    )
);

ETA:

So, I get the raw page html just after it loaded (in my firefox addon), run the javascript function that parse this page by regexp.

Yeah, don't do that. You can't parse HTML with regex.

Why not let Firefox take care of parsing the page for its given charset?

ETA(2):

// KOI8-R Russian Cyrillic
// Additional table entry in the same positional format as the other
// encodings: the character at index b is the decoding of byte value b.
'koi8-r': '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248\u2264\u2265\xa0\u2321\xb0\xb2\xb7\xf7\u2550\u2551\u2552\u0451\u2553\u2554\u2555\u2556\u2557\u2558\u2559\u255a\u255b\u255c\u255d\u255e\u255f\u2560\u2561\u0401\u2562\u2563\u2564\u2565\u2566\u2567\u2568\u2569\u256a\u256b\u256c\xa9\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a'

(You can generate the mapping for any single-byte encoding with Python 2, by running something like this:)

>>> ''.join(map(chr, range(256))).decode('koi8-r', 'replace')

I don't know how you're going about reading the input stream, but you shouldn't normally need to be doing this kind of encoding unmangling manually.

bobince
  • 528,062
  • 107
  • 651
  • 834
  • Bobince! Thank you so much for the help. Unfortunately, it seems the page is encoded in KOI8-R, because CP1252 didn't help much: after using your function the results are still unreadable. But if I wget the HTML page and copy-paste the result into the browser console with your function, the answer is readable. I'm really confused by this situation :( – Daniel Nov 09 '10 at 22:46
  • @bobince, great answer, I've improved your decodeBytes routine to only update 0x00 - 0xff and passthru the rest: `function decodeBytes(bytes, encoding) { return bytes.replace(/[\x00-\xff]/g, function(c) { return encodings[encoding].charAt(c.charCodeAt(0)); } ); }` – Stephen Quan Apr 28 '15 at 01:51