libxml2使用说明

libxml2使用说明:Libxml2 库的编译和使用。Libxml2 库提供了 C 语言解析和构造 xml 文档的接口,为后台 C 语言处理程序和前台应用程序提供了一种通用的通信方式。

大家好,欢迎来到IT知识分享网。Libxml2库的编译和使用

Libxml2库提供了C语言解析和构造xml文档的接口,为后台C语言处理程序和前台应用程序提供了一种通用的通信方式。

本文以libxml2-2.6.30版本来说明Libxml2库的使用方法。

1.    编译库文件

libxml2-2.6.30.tar.gz文件解压后,进入libxml2-2.6.30文件夹,顺序执行以下命令:

chmod +x ./configure

./configure

make

make install

“chmod +x ./configure”命令增加configure脚本的可执行权限;

“./configure”脚本根据当前编译系统的实际情况生成相应的makefile文件;

“make”命令执行上一命令中生成的makefile文件生成相应的目标文件;

“make install”命令主要把目标文件拷贝到/usr/local目录下,

/usr/local/lib目录下为以下库文件:

libxml2.a  libxml2.la  libxml2.so  libxml2.so.2  libxml2.so.2.6.30  pkgconfig  xml2Conf.sh

/usr/local/include/libxml2目录下是使用Libxml2库时需要的头文件,这些头文件位于其中的libxml子目录下;

2.    使用Libxml2库

Libxml2库的api参考可以从http://www.xmlsoft.org/html/index.html查询。下面以解析一个简单的xml文件为例,给出一个完整的例子。

Xml文档:

<ioMsg>

    <type>she</type>

    <subtype>

       <st1>123</st1>

       <st2>563</st2>

    </subtype>

</ioMsg>

C解析代码xmltest.c:

view plaincopy to clipboardprint?

   1. #include <libxml/parser.h>  

   2. #include <libxml/tree.h>  

   3.   

   4. int main(int argc, char* argv[])  

   5. {  

   6.     xmlDocPtr doc;           //定义解析文档指针  

   7.     xmlNodePtr curNode;      //定义结点指针(你需要它为了在各个结点间移动)  

   8.     xmlChar *szKey;          //临时字符串变量  

   9.     char *szDocName;  

  10.       

  11.     if (argc <= 1)   

  12.     {  

  13.        printf(“Usage: %s docname\n”, argv[0]);  

  14.        return(0);  

  15.     }  

  16.     szDocName = argv[1];  

  17.     doc = xmlReadFile(szDocName,”GB2312″,XML_PARSE_RECOVER); //解析文件  

  18.     if (NULL == doc)  

  19.     {    

  20.        printf(“Document not parsed successfully\n”);      

  21.        return -1;  

  22.     }  

  23.     curNode = xmlDocGetRootElement(doc); //确定文档根元素  

  24.     if (NULL == curNode)  

  25.     {  

  26.        printf(“empty document\n”);  

  27.        xmlFreeDoc(doc);  

  28.        return -1;  

  29.     }  

  30.     if (xmlStrcmp(curNode->name, BAD_CAST “ioMsg”))  

  31.     {  

  32.        printf(“document of the wrong type, root node != ioMsg\n”);  

  33.        xmlFreeDoc(doc);  

  34.        return -1;  

  35.     }  

  36.     curNode = curNode->children;  

  37.     while(curNode != NULL)  

  38.     {  

  39.        //取出节点中的内容  

  40.        szKey = xmlNodeGetContent(curNode);  

  41.        printf(“Content value =%s\n”, szKey);  

  42.        curNode = curNode->next;  

  43.      }  

  44.      xmlFreeDoc(doc);  

  45.     return 0;     

  46. }  

#include <stdio.h>
#include <libxml/parser.h>
#include <libxml/tree.h>

/*
 * xmltest: parse the XML file named by argv[1] and print the content of
 * every direct child of the <ioMsg> root element.
 *
 * Returns 0 after a successful walk (and also when only the usage line is
 * printed), -1 when the document cannot be parsed, has no root element, or
 * the root element is not named "ioMsg".
 */
int main(int argc, char* argv[])
{
    xmlDocPtr doc;           /* parsed document */
    xmlNodePtr curNode;      /* cursor used to move between nodes */
    xmlChar *szKey;          /* content of the node under the cursor */
    char *szDocName;

    if (argc <= 1) {
        printf("Usage: %s docname\n", argv[0]);
        return(0);
    }
    szDocName = argv[1];
    doc = xmlReadFile(szDocName, "GB2312", XML_PARSE_RECOVER); /* parse the file */
    if (NULL == doc) {
        printf("Document not parsed successfully\n");
        return -1;
    }
    curNode = xmlDocGetRootElement(doc); /* locate the document root */
    if (NULL == curNode) {
        printf("empty document\n");
        xmlFreeDoc(doc);
        return -1;
    }
    if (xmlStrcmp(curNode->name, BAD_CAST "ioMsg")) {
        printf("document of the wrong type, root node != ioMsg\n");
        xmlFreeDoc(doc);
        return -1;
    }
    curNode = curNode->children;
    while (curNode != NULL) {
        /*
         * xmlNodeGetContent() hands back a freshly allocated string (or
         * NULL); it must be released with xmlFree() or every iteration
         * leaks, and NULL must not be passed to "%s".
         */
        szKey = xmlNodeGetContent(curNode);
        if (szKey != NULL) {
            printf("Content value =%s\n", szKey);
            xmlFree(szKey);
        }
        curNode = curNode->next;
    }
    xmlFreeDoc(doc);
    return 0;
}

3.    编译xml解析程序

假设Libxml2库是按步骤1的编译方式,其库文件和头文件分别位于/usr/local/lib和/usr/local/include/libxml2目录下。

动态库编译方式:

cc -o xmltest -I/usr/local/include/libxml2 xmltest.c -L/usr/local/lib -lxml2

 

静态库的编译方式:

cc -o xmltest -I/usr/local/include/libxml2 xmltest.c /usr/local/lib/libxml2.a -lz -lm

“-I/usr/local/include/libxml2”指定Libxml2库的头文件所在的路径,“-L/usr/local/lib”指定动态库所在路径。

静态链接时,除了需要libxml2.a外,还需要链接libz.a(-lz)和数学库(-lm)。


支持GB2312的LIBXML2库源代码的修改

xmlSaveFileEnc( this->szConfigFile, this->m_doc, "GB2312" );

Libxml2是开源的xml解析器,使用中发现它不支持中文,于是自己加了些代码,使其支持GB2312,修改的代码如下:

//encoding.c

xmlCharEncoding

xmlParseCharEncoding(const char* name)

{

    const char *alias;

    char upper[500];

    int i;

    if (name == NULL)

    return(XML_CHAR_ENCODING_NONE);

    ///*

     * Do the alias resolution

     */

    alias = xmlGetEncodingAlias(name);

    if (alias != NULL)

    name = alias;

    for (i = 0;i < 499;i++) {

        upper[i] = toupper(name[i]);

    if (upper[i] == 0) break;

    }

    upper[i] = 0;

    if (!strcmp(upper, “”)) return(XML_CHAR_ENCODING_NONE);

    if (!strcmp(upper, “UTF-8”)) return(XML_CHAR_ENCODING_UTF8);

    if (!strcmp(upper, “UTF8”)) return(XML_CHAR_ENCODING_UTF8);

    //中文化处理,让该XML分析器支持中文GB2312 BY JRuiui.NET 2005.12.23

    if (!strcmp(upper, “GB2312”)) return (XML_CHAR_ENCODING_GB2312);

    ///*

     * NOTE: if we were able to parse this, the endianness of UTF16 is

     *       already found and in use

     */

    if (!strcmp(upper, “UTF-16”)) return(XML_CHAR_ENCODING_UTF16LE);

    if (!strcmp(upper, “UTF16”)) return(XML_CHAR_ENCODING_UTF16LE);

    

    if (!strcmp(upper, “ISO-10646-UCS-2”)) return(XML_CHAR_ENCODING_UCS2);

    if (!strcmp(upper, “UCS-2”)) return(XML_CHAR_ENCODING_UCS2);

    if (!strcmp(upper, “UCS2”)) return(XML_CHAR_ENCODING_UCS2);

    ///*

     * NOTE: if we were able to parse this, the endianness of UCS4 is

     *       already found and in use

     */

    if (!strcmp(upper, “ISO-10646-UCS-4”)) return(XML_CHAR_ENCODING_UCS4LE);

    if (!strcmp(upper, “UCS-4”)) return(XML_CHAR_ENCODING_UCS4LE);

    if (!strcmp(upper, “UCS4”)) return(XML_CHAR_ENCODING_UCS4LE);

    

    if (!strcmp(upper,  “ISO-8859-1”)) return(XML_CHAR_ENCODING_8859_1);

    if (!strcmp(upper,  “ISO-LATIN-1”)) return(XML_CHAR_ENCODING_8859_1);

    if (!strcmp(upper,  “ISO LATIN 1”)) return(XML_CHAR_ENCODING_8859_1);

    if (!strcmp(upper,  “ISO-8859-2”)) return(XML_CHAR_ENCODING_8859_2);

    if (!strcmp(upper,  “ISO-LATIN-2”)) return(XML_CHAR_ENCODING_8859_2);

    if (!strcmp(upper,  “ISO LATIN 2”)) return(XML_CHAR_ENCODING_8859_2);

    if (!strcmp(upper,  “ISO-8859-3”)) return(XML_CHAR_ENCODING_8859_3);

    if (!strcmp(upper,  “ISO-8859-4”)) return(XML_CHAR_ENCODING_8859_4);

    if (!strcmp(upper,  “ISO-8859-5”)) return(XML_CHAR_ENCODING_8859_5);

    if (!strcmp(upper,  “ISO-8859-6”)) return(XML_CHAR_ENCODING_8859_6);

    if (!strcmp(upper,  “ISO-8859-7”)) return(XML_CHAR_ENCODING_8859_7);

    if (!strcmp(upper,  “ISO-8859-8”)) return(XML_CHAR_ENCODING_8859_8);

    if (!strcmp(upper,  “ISO-8859-9”)) return(XML_CHAR_ENCODING_8859_9);

    if (!strcmp(upper, “ISO-2022-JP”)) return(XML_CHAR_ENCODING_2022_JP);

    if (!strcmp(upper, “SHIFT_JIS”)) return(XML_CHAR_ENCODING_SHIFT_JIS);

    if (!strcmp(upper, “EUC-JP”)) return(XML_CHAR_ENCODING_EUC_JP);

#ifdef DEBUG_ENCODING

    xmlGenericError(xmlGenericErrorContext, “Unknown encoding %s “, name);

#endif

    return(XML_CHAR_ENCODING_ERROR);

}

/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding value
 *
 * Map an xmlCharEncoding value to the encoding name used for it here.
 * Both UTF-16 byte orders map to "UTF-16" and all four UCS-4 orderings
 * map to "ISO-10646-UCS-4".
 *
 * Returns a pointer to a constant string, or NULL for
 * XML_CHAR_ENCODING_ERROR, XML_CHAR_ENCODING_NONE, XML_CHAR_ENCODING_ASCII
 * and any value not listed in the switch.
 */
const char*
xmlGetCharEncodingName(xmlCharEncoding enc) {
    switch (enc) {
        /* Chinese support: GB2312 encoding */
        case XML_CHAR_ENCODING_GB2312:
            return("GB2312");
        case XML_CHAR_ENCODING_ERROR:
            return(NULL);
        case XML_CHAR_ENCODING_NONE:
            return(NULL);
        case XML_CHAR_ENCODING_UTF8:
            return("UTF-8");
        case XML_CHAR_ENCODING_UTF16LE:
            return("UTF-16");
        case XML_CHAR_ENCODING_UTF16BE:
            return("UTF-16");
        case XML_CHAR_ENCODING_EBCDIC:
            return("EBCDIC");
        case XML_CHAR_ENCODING_UCS4LE:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4BE:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4_2143:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4_3412:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS2:
            return("ISO-10646-UCS-2");
        case XML_CHAR_ENCODING_8859_1:
            return("ISO-8859-1");
        case XML_CHAR_ENCODING_8859_2:
            return("ISO-8859-2");
        case XML_CHAR_ENCODING_8859_3:
            return("ISO-8859-3");
        case XML_CHAR_ENCODING_8859_4:
            return("ISO-8859-4");
        case XML_CHAR_ENCODING_8859_5:
            return("ISO-8859-5");
        case XML_CHAR_ENCODING_8859_6:
            return("ISO-8859-6");
        case XML_CHAR_ENCODING_8859_7:
            return("ISO-8859-7");
        case XML_CHAR_ENCODING_8859_8:
            return("ISO-8859-8");
        case XML_CHAR_ENCODING_8859_9:
            return("ISO-8859-9");
        case XML_CHAR_ENCODING_2022_JP:
            return("ISO-2022-JP");
        case XML_CHAR_ENCODING_SHIFT_JIS:
            return("Shift-JIS");
        case XML_CHAR_ENCODING_EUC_JP:
            return("EUC-JP");
        case XML_CHAR_ENCODING_ASCII:
            return(NULL);
    }
    return(NULL);
}

//parserInternals.c

/**
 * xmlCurrentChar:
 * @ctxt:  the XML parser context
 * @len:  set to the number of input bytes the returned value occupies
 *
 * Return the value at the current input position without advancing past
 * it.  Printable ASCII (0x20..0x7F) is returned directly.  When the input
 * encoding name is "GB2312" each byte is returned on its own with
 * *len == 1.  When ctxt->charset is UTF-8, a 1- to 4-byte sequence is
 * decoded and validated.  Any other charset is treated as a fixed
 * one-byte encoding.  A CR (and a following LF, if present) is reported
 * as a single 0xA.
 *
 * Returns the character value, 0 when the parser is in the EOF state, or
 * the pending ctxt->token (with *len == 0) when one is set.
 */
int
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }

    /* Fast path: printable ASCII needs no decoding in any branch below. */
    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
        *len = 1;
        return((int) *ctxt->input->cur);
    }

    /*
     * Chinese (GB2312) support.  The encoding name may be absent, so it is
     * checked for NULL before being compared.
     * NOTE(review): this branch hands back one raw byte at a time and does
     * no conversion of double-byte sequences - confirm that callers expect
     * this.
     */
    if ((ctxt->input->encoding != NULL) &&
        (!strcmp((const char *) ctxt->input->encoding, "GB2312")))
    {
        ctxt->charset = XML_CHAR_ENCODING_GB2312; /* Chinese GB2312 */
        *len = 1;
        return((int) *ctxt->input->cur);
    }

    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        const unsigned char *cur = ctxt->input->cur;
        unsigned char c;
        unsigned int val;

        c = *cur;
        if (c & 0x80) {
            /*
             * After each grow the pointer is re-read from the input
             * structure: xmlParserInputGrow() may relocate the buffer
             * (presumably - see the libxml2 parserInternals docs), which
             * would leave the cached pointer stale.
             */
            if (cur[1] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xe0) == 0xe0) {
                if (cur[2] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xf0) == 0xf0) {
                    if (cur[3] == 0) {
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
                    if (((c & 0xf8) != 0xf0) ||
                        ((cur[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = (cur[0] & 0x7) << 18;
                    val |= (cur[1] & 0x3f) << 12;
                    val |= (cur[2] & 0x3f) << 6;
                    val |= cur[3] & 0x3f;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    val = (cur[0] & 0xf) << 12;
                    val |= (cur[1] & 0x3f) << 6;
                    val |= cur[2] & 0x3f;
                }
            } else {
                /* 2-byte code */
                *len = 2;
                val = (cur[0] & 0x1f) << 6;
                val |= cur[1] & 0x3f;
            }
            if (!IS_CHAR(val)) {
                if ((ctxt->sax != NULL) &&
                    (ctxt->sax->error != NULL))
                    ctxt->sax->error(ctxt->userData,
                                     "Char 0x%X out of allowed range ", val);
                ctxt->errNo = XML_ERR_INVALID_ENCODING;
                ctxt->wellFormed = 0;
                ctxt->disableSAX = 1;
            }
            return(val);
        } else {
            /* 1-byte code */
            *len = 1;
            if (*ctxt->input->cur == 0xD) {
                /* Fold CR LF into a single LF by skipping the CR. */
                if (ctxt->input->cur[1] == 0xA) {
                    ctxt->nbChars++;
                    ctxt->input->cur++;
                }
                return(0xA);
            }
            return((int) *ctxt->input->cur);
        }
    }

    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if (*ctxt->input->cur == 0xD) {
        if (ctxt->input->cur[1] == 0xA) {
            ctxt->nbChars++;
            ctxt->input->cur++;
        }
        return(0xA);
    }
    return((int) *ctxt->input->cur);

encoding_error:
    /*
     * If we detect an UTF8 error that probably means that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
        ctxt->sax->error(ctxt->userData,
                         "Input is not proper UTF-8, indicate encoding ! ");
        ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X ",
                         ctxt->input->cur[0], ctxt->input->cur[1],
                         ctxt->input->cur[2], ctxt->input->cur[3]);
    }
    ctxt->errNo = XML_ERR_INVALID_ENCODING;

    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}

//encoding.h

/*
 * xmlCharEncoding:
 *
 * Identifiers for the character encodings listed below.  Values are
 * assigned explicitly; GB2312 is appended after the pre-existing entries
 * with the next free value.
 */
typedef enum {
    XML_CHAR_ENCODING_ERROR     = -1,  /* No char encoding detected */
    XML_CHAR_ENCODING_NONE      = 0,   /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8      = 1,   /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE   = 2,   /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE   = 3,   /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE    = 4,   /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE    = 5,   /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC    = 6,   /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 = 7,   /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 = 8,   /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2      = 9,   /* UCS-2 */
    XML_CHAR_ENCODING_8859_1    = 10,  /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2    = 11,  /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3    = 12,  /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4    = 13,  /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5    = 14,  /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6    = 15,  /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7    = 16,  /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8    = 17,  /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9    = 18,  /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP   = 19,  /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20,  /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP    = 21,  /* EUC-JP */
    XML_CHAR_ENCODING_ASCII     = 22,  /* pure ASCII */
    XML_CHAR_ENCODING_GB2312    = 23   /* GB2312, added for Chinese support */
} xmlCharEncoding;

免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/142498.html

(0)
上一篇 2025-05-09 22:00
下一篇 2025-05-09 22:10

相关推荐

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

关注微信