4

Given a str as following:

(* Sample input: each doubled backslash in this literal yields one backslash
   character, so the string holds the literal text \u003C, \u003E, \u000D and
   \u000A rather than the characters those escapes denote. *)
let str = "#include \\u003Cunordered_map\\u003E\\u000D\\u000A"

How do I decode a unicode-escaped string into a Unicode string (or, in my case, an ASCII string) in OCaml?

In python I could easily do

str.decode("unicode-escape")
Oliver Young
  • 578
  • 1
  • 4
  • 12

1 Answer

3

If your embedded escape sequences are always going to encode ASCII characters, as you say, you can find them and replace them with the decoded equivalent:

(** [decode s] returns [s] with every escape of the form backslash, [u], four
    hexadecimal digits replaced by the single byte whose code is that
    hexadecimal value. Text outside such escapes is copied unchanged.

    @raise Invalid_argument if an escape denotes a value above 255, because
    the value is converted with [Char.chr]. *)
let decode s =
    let escape = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
    let expand piece =
        match piece with
        | Str.Text literal -> literal
        | Str.Delim esc ->
            (* Skip the two-character backslash-u prefix and keep the digits. *)
            let digits = String.sub esc 2 4 in
            let code = int_of_string ("0x" ^ digits) in
            String.make 1 (Char.chr code)
    in
    Str.full_split escape s |> List.map expand |> String.concat ""

This works for your example:

val decode : string -> string = <fun>
# decode "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"

Indeed, Python has built-in support to decode these sequences.

Update

To support all four-digit hex escape sequences "\uXXXX" by converting to UTF-8 you can use this code:

(** [utf8encode s] parses [s] as a hexadecimal number written without the
    ["0x"] prefix (for example ["20AC"]) and returns the UTF-8 byte sequence
    for that code point, one to four bytes long.

    Values in the surrogate range D800-DFFF are not rejected; they are encoded
    with the ordinary three-byte pattern.

    @raise Failure if [s] is not accepted by [int_of_string] once prefixed
    with ["0x"].
    @raise Invalid_argument if the value is negative or greater than
    [0x10FFFF]. *)
let utf8encode s =
    let cp = int_of_string ("0x" ^ s) in
    if cp < 0 || cp > 0x10FFFF then
        invalid_arg "utf8encode: code point out of range";
    let buf = Buffer.create 4 in
    let add n = Buffer.add_char buf (Char.chr n) in
    (* Continuation byte: marker bits 10 followed by six payload bits taken
       from [cp] starting at bit [shift]. *)
    let cont shift = add (0x80 lor ((cp lsr shift) land 0x3f)) in
    if cp < 0x80 then
        add cp
    else if cp < 0x800 then begin
        add (0xc0 lor (cp lsr 6));
        cont 0
    end else if cp < 0x10000 then begin
        add (0xe0 lor (cp lsr 12));
        cont 6;
        cont 0
    end else begin
        add (0xf0 lor (cp lsr 18));
        cont 12;
        cont 6;
        cont 0
    end;
    Buffer.contents buf

(** [decode2 s] returns [s] with every escape of the form backslash, [u], four
    hexadecimal digits replaced by the result of applying [utf8encode] to those
    four digits. Text outside such escapes is copied unchanged. *)
let decode2 s =
    let escape = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
    let out = Buffer.create (String.length s) in
    let emit piece =
        match piece with
        | Str.Text literal -> Buffer.add_string out literal
        | Str.Delim esc ->
            (* Drop the backslash-u prefix; the remaining four characters are
               the hexadecimal digits. *)
            Buffer.add_string out (utf8encode (String.sub esc 2 4))
    in
    List.iter emit (Str.full_split escape s);
    Buffer.contents out

It also works for your example, and some other examples:

val utf8encode : string -> string = <fun>
val decode2 : string -> string = <fun>
# decode2 "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
# print_endline (decode2 "\\u00A2");;
¢
- : unit = ()
# print_endline (decode2 "\\u20AC");;
€
- : unit = ()
Jeffrey Scofield
  • 65,646
  • 2
  • 72
  • 108
  • Thanks. That works for ASCII-encoded escape sequences. What if I have UTF-8-encoded escape sequences? Does OCaml have an easy-to-use library for that? In Python, I could easily do the following: str = str.encode("utf-8").decode("unicode-escape") – Oliver Young Feb 25 '17 at 17:02
  • There is some UTF-8 support in [Batteries](http://ocaml-batteries-team.github.io/batteries-included/hdoc2/BatUTF8.html). To be honest I would probably code up my own conversion unless there was UTF-8 support in a library I was already using. It's a simple conversion; I'll add code above. – Jeffrey Scofield Feb 25 '17 at 20:07
  • Thanks for the detailed answer! Very useful – Oliver Young Feb 25 '17 at 21:08
  • Just to confirm: the utf8encode you gave only covers UTF-8 sequences of up to 3 bytes. 4-byte UTF-8 is not covered, right? – Oliver Young Feb 26 '17 at 00:16
  • Yes. I tested in Python and only 4-hex-digit escapes are supported as far as I could tell. It's a small change to support an extra byte, the code is set up for it. Just add the next prefix (0xf0). – Jeffrey Scofield Feb 26 '17 at 00:21
  • For a fifth byte just add the *next* prefix (0xf8). For six bytes also add 0xfc. – Jeffrey Scofield Feb 26 '17 at 00:27
  • Yeah, that's what I thought. Thanks again. – Oliver Young Feb 26 '17 at 00:30