0

I am trying to use expat for XML parsing. My XML file contains Russian characters, and they appear to be interpreted incorrectly by expat.

I got expired_str: Русский текст

Instead of: Русский текст

Here is my trimmed-down code:

    /*
     * Parse an in-memory XML document with expat, filling *startUp_sp through
     * the startElement/endElement handlers.
     *
     * buffer     - XML document bytes; n bytes are copied into expat's buffer.
     * n          - number of bytes in buffer.
     * startUp_sp - handed to the handlers as expat user data.
     *
     * Returns RES_OK on success; RES_ERR if the arguments are invalid, the
     * parser or its input buffer cannot be allocated, or the document fails
     * to parse. The parser is always released before returning.
     */
    static int xmlParseStartup(char *buffer, int n, START_UP_T *startUp_sp)
    {
        int res = RES_OK;
        void *buff;
        XML_Parser parser_p;

        if (!buffer || n < 0)
        {
            DEBUG("Invalid XML input buffer!\n");
            return RES_ERR;
        }

        /* A non-NULL encoding here overrides the encoding declared in the document. */
        parser_p = XML_ParserCreate("UTF-8");
        if (!parser_p)
        {
            DEBUG("Unable to create parser!\n");
            return RES_ERR;
        }
        XML_SetUserData(parser_p, (void *)startUp_sp);
        XML_SetElementHandler(parser_p, startElement, endElement);

        /* XML_GetBuffer returns NULL when it cannot allocate; never memcpy into that. */
        buff = XML_GetBuffer(parser_p, n);
        if (!buff)
        {
            DEBUG("Unable to allocate parse buffer!\n");
            res = RES_ERR;
        }
        else
        {
            memcpy(buff, buffer, (size_t)n);

            if (XML_STATUS_ERROR == XML_ParseBuffer(parser_p, n, TRUE))
            {
                DEBUG("%s at line %" XML_FMT_INT_MOD "u\n",
                      XML_ErrorString(XML_GetErrorCode(parser_p)),
                      XML_GetCurrentLineNumber(parser_p));
                res = RES_ERR;
            }
        }

        /* Release the parser on every path; the original leaked it. */
        XML_ParserFree(parser_p);
        return res;
    }

/*
 * expat start-tag handler.
 *
 * Prints one '.' to stderr per current nesting level, logs the element name,
 * and for <response name="..." value="..."/> elements stores the value into
 * the matching START_UP_T field ("result", "status_str" or "status_width").
 * Finally increments the depth counter.
 *
 * userData - the START_UP_T registered with XML_SetUserData.
 * name     - element name.
 * atts     - NULL-terminated array of alternating attribute names and values.
 */
static void XMLCALL startElement(void       *userData,
                                 const char *name,
                                 const char **atts)
{
    int i;
    const char *attrName = NULL;
    const char *attrValue = NULL;
    START_UP_T *startUp_sp = (START_UP_T *)userData;

    for (i = 0; i < startUp_sp->depthPtr; i++)
    {
        fprintf(stderr, ".");
    }
    DEBUG("[%d]name: %s\n", startUp_sp->depthPtr, name);

    if (0 == strcmp(name, "response"))
    {
        /*
         * Look the attributes up by key. The original indexed atts[] with the
         * leftover depth-loop counter and a fixed atts[3], which only worked
         * at depth 1 with exactly this attribute order.
         */
        for (i = 0; atts[i] && atts[i + 1]; i += 2)
        {
            if (0 == strcmp(atts[i], "name"))
            {
                attrName = atts[i + 1];
            }
            else if (0 == strcmp(atts[i], "value"))
            {
                attrValue = atts[i + 1];
            }
        }

        if (attrName && attrValue)
        {
            if (0 == strcmp(attrName, "result"))
            {
                startUp_sp->result = atoi(attrValue);
                DEBUG("RESULT: %d\n", startUp_sp->result);
            }
            else if (0 == strcmp(attrName, "status_str"))
            {
                /* NOTE(review): unbounded copy; the capacity of expired_str is
                 * not visible here - confirm it can hold any attribute value. */
                strcpy(startUp_sp->expired_str, attrValue);
                DEBUG("EXPIRED_STR: %s\n", startUp_sp->expired_str);
            }
            else if (0 == strcmp(attrName, "status_width"))
            {
                startUp_sp->status_width = atoi(attrValue);
            }
        }
    }

    startUp_sp->depthPtr += 1;
}

/*
 * expat end-tag handler: an element has been closed, so step the depth
 * counter kept in the START_UP_T user data back by one.
 */
static void XMLCALL endElement(void       *userData,
                               const char *name)
{
    START_UP_T *ctx = (START_UP_T *)userData;

    (void)name; /* the element name is not needed to track depth */
    ctx->depthPtr = ctx->depthPtr - 1;
}

XML file:

<?xml version="1.0" encoding="UTF-8"?>
<startup>
    <response name="result" value="0"/>
    <response name="status_str" value="Русский текст"/>
    <response name="status_width" value="120"/>
</startup>
Yury Bushev
  • 642
  • 9
  • 25

1 Answers1

1

You are seeing a CP-1251 rendering of a UTF-8 encoded char* string, so expat is actually working fine - it is the console output you are having problems with.

If that is not the case, check for the UTF-8 marker (BOM) at the beginning of the XML file: the bytes 239, 187, 191 (decimal), which appear as "п»ї" (without quotes) when viewed as CP-1251.

One more: You should check the actual encoding of the .xml file, looks like it is not what you think it is (utf-8). What editor do you use to create the file ?

The CP1251 representation of UTF-8 "Русский текст" string is "Р С_С_С_РєРёР№ С'РчРєС_С'".

Viktor Latypov
  • 14,289
  • 3
  • 40
  • 55
  • Now I have the XML file encoding shown as "ANSI as UTF-8", which confuses me; I use Notepad++. The *.c file with the main function is saved as "UTF-8" by VS. Now I get incorrect output after reading the XML file, like this: "" – Yury Bushev Dec 29 '12 at 19:25
  • I revert back changes in *.c files (to windows-1251), XML file in the same encoding now. Also I call XML_ParserCreate routine with NULL pointer. I got two stdout outputs. Before parsing: "". After parsing: "EXPIRED_STR: Р СѓСЃСЃРєРёР№ текст". How to fix that? – Yury Bushev Dec 29 '12 at 19:45
  • ANSI as UTF-8 means "No unicode marker at the beginning". You should save it as Unicode. – Viktor Latypov Dec 29 '12 at 20:53