大家好,欢迎来到IT知识分享网。Libxml2库的编译和使用
Libxml2库提供了用C语言解析和构造XML文档的接口,为后台C语言处理程序和前台应用程序提供了一种通用的通信方式。
本文以libxml2-2.6.30版本来说明Libxml2库的使用方法。
1. 编译库文件
libxml2-2.6.30.tar.gz文件解压后,进入libxml2-2.6.30文件夹,顺序执行以下命令:
chmod +x ./configure
./configure
make
make install
“chmod +x ./configure”命令增加configure脚本的可执行权限;
“./configure”脚本根据当前编译系统的实际情况生成相应的makefile文件;
“make”命令执行上一命令中生成的makefile文件生成相应的目标文件;
“make install”命令主要把目标文件拷贝到/usr/local目录下,
/usr/local/lib目录下为以下库文件:
libxml2.a libxml2.la libxml2.so libxml2.so.2 libxml2.so.2.6.30 pkgconfig xml2Conf.sh
/usr/local/include/libxml2目录下存放使用Libxml2库时需要的头文件,这些头文件位于其libxml子目录中;
2. 使用Libxml2库
Libxml2库的api参考可以从http://www.xmlsoft.org/html/index.html查询。下面以解析一个简单的xml文件为例,给出一个完整的例子。
Xml文档:
<ioMsg>
<type>she</type>
<subtype>
<st1>123</st1>
<st2>563</st2>
</subtype>
</ioMsg>
C解析代码xmltest.c:
view plaincopy to clipboardprint?
1. #include <libxml/parser.h>
2. #include <libxml/tree.h>
3.
4. int main(int argc, char* argv[])
5. {
6. xmlDocPtr doc; //定义解析文档指针
7. xmlNodePtr curNode; //定义结点指针(你需要它为了在各个结点间移动)
8. xmlChar *szKey; //临时字符串变量
9. char *szDocName;
10.
11. if (argc <= 1)
12. {
13. printf(“Usage: %s docname\n”, argv[0]);
14. return(0);
15. }
16. szDocName = argv[1];
17. doc = xmlReadFile(szDocName,”GB2312″,XML_PARSE_RECOVER); //解析文件
18. if (NULL == doc)
19. {
20. printf(“Document not parsed successfully\n”);
21. return -1;
22. }
23. curNode = xmlDocGetRootElement(doc); //确定文档根元素
24. if (NULL == curNode)
25. {
26. printf(“empty document\n”);
27. xmlFreeDoc(doc);
28. return -1;
29. }
30. if (xmlStrcmp(curNode->name, BAD_CAST “ioMsg”))
31. {
32. printf(“document of the wrong type, root node != ioMsg\n”);
33. xmlFreeDoc(doc);
34. return -1;
35. }
36. curNode = curNode->children;
37. while(curNode != NULL)
38. {
39. //取出节点中的内容
40. szKey = xmlNodeGetContent(curNode);
41. printf(“Content value =%s\n”, szKey);
42. curNode = curNode->next;
43. }
44. xmlFreeDoc(doc);
45. return 0;
46. }
#include <stdio.h>
#include <libxml/parser.h>
#include <libxml/tree.h>

/*
 * xmltest: parse the XML file named by argv[1], verify that its root
 * element is <ioMsg>, and print the text content of every direct child
 * of the root.
 *
 * Returns 0 on success (and also after printing the usage line when no
 * file name is given), -1 if the document cannot be parsed, is empty, or
 * has a different root element.
 */
int main(int argc, char *argv[])
{
    xmlDocPtr doc;        /* parsed document */
    xmlNodePtr curNode;   /* cursor used to walk from node to node */
    xmlChar *szKey;       /* content of the current node (owned by us) */
    char *szDocName;

    if (argc <= 1)
    {
        printf("Usage: %s docname\n", argv[0]);
        return(0);
    }
    szDocName = argv[1];

    /* Parse the file, forcing GB2312 as the input encoding. */
    doc = xmlReadFile(szDocName, "GB2312", XML_PARSE_RECOVER);
    if (NULL == doc)
    {
        printf("Document not parsed successfully\n");
        return -1;
    }

    /* Locate the root element. */
    curNode = xmlDocGetRootElement(doc);
    if (NULL == curNode)
    {
        printf("empty document\n");
        xmlFreeDoc(doc);
        return -1;
    }
    if (xmlStrcmp(curNode->name, BAD_CAST "ioMsg"))
    {
        printf("document of the wrong type, root node != ioMsg\n");
        xmlFreeDoc(doc);
        return -1;
    }

    /* Walk the direct children of the root and print their content. */
    curNode = curNode->children;
    while (curNode != NULL)
    {
        szKey = xmlNodeGetContent(curNode);
        if (szKey != NULL)
        {
            printf("Content value =%s\n", (const char *) szKey);
            /* xmlNodeGetContent returns a copy the caller must release. */
            xmlFree(szKey);
        }
        curNode = curNode->next;
    }

    xmlFreeDoc(doc);
    return 0;
}
3. 编译xml解析程序
假设Libxml2库是按步骤1的编译方式,其库文件和头文件分别位于/usr/local/lib和/usr/local/include/libxml2目录下。
动态库编译方式:
cc -o xmltest -I/usr/local/include/libxml2 xmltest.c -L/usr/local/lib -lxml2
静态库的编译方式:
cc -o xmltest -I/usr/local/include/libxml2 xmltest.c /usr/local/lib/libxml2.a -lz -lm
“-I/usr/local/include/libxml2”指定Libxml2库的头文件所在的路径,“-L/usr/local/lib”指定动态库所在路径。
静态链接时,除了libxml2.a之外,还需要链接libz.a(-lz)和数学库(-lm)。
支持GB2312的LIBXML2库源代码的修改
xmlSaveFileEnc( this->szConfigFile, this->m_doc, "GB2312" );
Libxml2是开源的XML解析器,使用中发现它不支持中文,于是自己加了些代码,使其支持GB2312,修改的代码如下:
//encoding.c
xmlCharEncoding
xmlParseCharEncoding(const char* name)
{
const char *alias;
char upper[500];
int i;
if (name == NULL)
return(XML_CHAR_ENCODING_NONE);
///*
* Do the alias resolution
*/
alias = xmlGetEncodingAlias(name);
if (alias != NULL)
name = alias;
for (i = 0;i < 499;i++) {
upper[i] = toupper(name[i]);
if (upper[i] == 0) break;
}
upper[i] = 0;
if (!strcmp(upper, “”)) return(XML_CHAR_ENCODING_NONE);
if (!strcmp(upper, “UTF-8”)) return(XML_CHAR_ENCODING_UTF8);
if (!strcmp(upper, “UTF8”)) return(XML_CHAR_ENCODING_UTF8);
//中文化处理,让该XML分析器支持中文GB2312 BY JRuiui.NET 2005.12.23
if (!strcmp(upper, “GB2312”)) return (XML_CHAR_ENCODING_GB2312);
///*
* NOTE: if we were able to parse this, the endianness of UTF16 is
* already found and in use
*/
if (!strcmp(upper, “UTF-16”)) return(XML_CHAR_ENCODING_UTF16LE);
if (!strcmp(upper, “UTF16”)) return(XML_CHAR_ENCODING_UTF16LE);
if (!strcmp(upper, “ISO-10646-UCS-2”)) return(XML_CHAR_ENCODING_UCS2);
if (!strcmp(upper, “UCS-2”)) return(XML_CHAR_ENCODING_UCS2);
if (!strcmp(upper, “UCS2”)) return(XML_CHAR_ENCODING_UCS2);
///*
* NOTE: if we were able to parse this, the endianness of UCS4 is
* already found and in use
*/
if (!strcmp(upper, “ISO-10646-UCS-4”)) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, “UCS-4”)) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, “UCS4”)) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, “ISO-8859-1”)) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, “ISO-LATIN-1”)) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, “ISO LATIN 1”)) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, “ISO-8859-2”)) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, “ISO-LATIN-2”)) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, “ISO LATIN 2”)) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, “ISO-8859-3”)) return(XML_CHAR_ENCODING_8859_3);
if (!strcmp(upper, “ISO-8859-4”)) return(XML_CHAR_ENCODING_8859_4);
if (!strcmp(upper, “ISO-8859-5”)) return(XML_CHAR_ENCODING_8859_5);
if (!strcmp(upper, “ISO-8859-6”)) return(XML_CHAR_ENCODING_8859_6);
if (!strcmp(upper, “ISO-8859-7”)) return(XML_CHAR_ENCODING_8859_7);
if (!strcmp(upper, “ISO-8859-8”)) return(XML_CHAR_ENCODING_8859_8);
if (!strcmp(upper, “ISO-8859-9”)) return(XML_CHAR_ENCODING_8859_9);
if (!strcmp(upper, “ISO-2022-JP”)) return(XML_CHAR_ENCODING_2022_JP);
if (!strcmp(upper, “SHIFT_JIS”)) return(XML_CHAR_ENCODING_SHIFT_JIS);
if (!strcmp(upper, “EUC-JP”)) return(XML_CHAR_ENCODING_EUC_JP);
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext, “Unknown encoding %s “, name);
#endif
return(XML_CHAR_ENCODING_ERROR);
}
/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding identifier
 *
 * Return the name string associated with @enc.  Both UTF-16 byte orders
 * share the name "UTF-16" and all four UCS-4 byte orders share
 * "ISO-10646-UCS-4".
 *
 * Returns a constant string, or NULL for XML_CHAR_ENCODING_ERROR,
 * XML_CHAR_ENCODING_NONE, XML_CHAR_ENCODING_ASCII and any value not
 * listed in the switch.
 */
const char*
xmlGetCharEncodingName(xmlCharEncoding enc) {
    switch (enc) {
        /* Local addition: GB2312 (Simplified Chinese). */
        case XML_CHAR_ENCODING_GB2312:
            return("GB2312");
        case XML_CHAR_ENCODING_ERROR:
            return(NULL);
        case XML_CHAR_ENCODING_NONE:
            return(NULL);
        case XML_CHAR_ENCODING_UTF8:
            return("UTF-8");
        case XML_CHAR_ENCODING_UTF16LE:
            return("UTF-16");
        case XML_CHAR_ENCODING_UTF16BE:
            return("UTF-16");
        case XML_CHAR_ENCODING_EBCDIC:
            return("EBCDIC");
        case XML_CHAR_ENCODING_UCS4LE:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4BE:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4_2143:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS4_3412:
            return("ISO-10646-UCS-4");
        case XML_CHAR_ENCODING_UCS2:
            return("ISO-10646-UCS-2");
        case XML_CHAR_ENCODING_8859_1:
            return("ISO-8859-1");
        case XML_CHAR_ENCODING_8859_2:
            return("ISO-8859-2");
        case XML_CHAR_ENCODING_8859_3:
            return("ISO-8859-3");
        case XML_CHAR_ENCODING_8859_4:
            return("ISO-8859-4");
        case XML_CHAR_ENCODING_8859_5:
            return("ISO-8859-5");
        case XML_CHAR_ENCODING_8859_6:
            return("ISO-8859-6");
        case XML_CHAR_ENCODING_8859_7:
            return("ISO-8859-7");
        case XML_CHAR_ENCODING_8859_8:
            return("ISO-8859-8");
        case XML_CHAR_ENCODING_8859_9:
            return("ISO-8859-9");
        case XML_CHAR_ENCODING_2022_JP:
            return("ISO-2022-JP");
        case XML_CHAR_ENCODING_SHIFT_JIS:
            return("Shift-JIS");
        case XML_CHAR_ENCODING_EUC_JP:
            return("EUC-JP");
        case XML_CHAR_ENCODING_ASCII:
            return(NULL);
    }
    return(NULL);
}
//parserInternals.c
/**
 * xmlCurrentChar:
 * @ctxt:  the XML parser context
 * @len:  output, set to the number of input bytes the returned value covers
 *
 * Return the character at the current input position without advancing
 * past it (the only cursor movement is skipping the CR of a CR/LF pair).
 *
 * Order of checks:
 *   - parser at EOF: returns 0 (@len is left untouched);
 *   - a pending ctxt->token: returned with *len == 0;
 *   - a byte in 0x20..0x7F: returned as-is with *len == 1;
 *   - input encoding declared as "GB2312" (local addition): the charset is
 *     switched to XML_CHAR_ENCODING_GB2312 and the current byte is returned
 *     with *len == 1.  Note that this hands a multi-byte GB2312 character
 *     back one byte at a time and does not apply the CR/LF normalisation
 *     done by the other branches;
 *   - UTF-8 charset: the 1- to 4-byte sequence is decoded and validated;
 *   - anything else: treated as a fixed 1-byte encoding.
 *
 * Returns the character value.
 */
int
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);
    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }
    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
        *len = 1;
        return((int) *ctxt->input->cur);
    }

    /*
     * GB2312 handling (local addition).  The encoding string is absent
     * when the document declares none, so it must be checked for NULL
     * before being compared.
     */
    if ((ctxt->input->encoding != NULL) &&
        (strcmp((const char *) ctxt->input->encoding, "GB2312") == 0))
    {
        ctxt->charset = XML_CHAR_ENCODING_GB2312;
        *len = 1;
        return((int) *ctxt->input->cur);
    }

    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        const unsigned char *cur = ctxt->input->cur;
        unsigned char c;
        unsigned int val;

        c = *cur;
        if (c & 0x80) {
            /*
             * Growing the input may move the underlying buffer, so the
             * local cursor is re-read from the input after every grow.
             */
            if (cur[1] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xe0) == 0xe0) {
                if (cur[2] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xf0) == 0xf0) {
                    if (cur[3] == 0) {
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
                    if (((c & 0xf8) != 0xf0) ||
                        ((cur[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = (cur[0] & 0x7) << 18;
                    val |= (cur[1] & 0x3f) << 12;
                    val |= (cur[2] & 0x3f) << 6;
                    val |= cur[3] & 0x3f;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    val = (cur[0] & 0xf) << 12;
                    val |= (cur[1] & 0x3f) << 6;
                    val |= cur[2] & 0x3f;
                }
            } else {
                /* 2-byte code */
                *len = 2;
                val = (cur[0] & 0x1f) << 6;
                val |= cur[1] & 0x3f;
            }
            if (!IS_CHAR(val)) {
                if ((ctxt->sax != NULL) &&
                    (ctxt->sax->error != NULL))
                    ctxt->sax->error(ctxt->userData,
                                     "Char 0x%X out of allowed range ", val);
                ctxt->errNo = XML_ERR_INVALID_ENCODING;
                ctxt->wellFormed = 0;
                ctxt->disableSAX = 1;
            }
            return(val);
        } else {
            /* 1-byte code */
            *len = 1;
            if (*ctxt->input->cur == 0xD) {
                /* CR or CR/LF is reported as a single LF. */
                if (ctxt->input->cur[1] == 0xA) {
                    ctxt->nbChars++;
                    ctxt->input->cur++;
                }
                return(0xA);
            }
            return((int) *ctxt->input->cur);
        }
    }

    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if (*ctxt->input->cur == 0xD) {
        if (ctxt->input->cur[1] == 0xA) {
            ctxt->nbChars++;
            ctxt->input->cur++;
        }
        return(0xA);
    }
    return((int) *ctxt->input->cur);

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
        ctxt->sax->error(ctxt->userData,
                         "Input is not proper UTF-8, indicate encoding ! ");
        ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X ",
                         ctxt->input->cur[0], ctxt->input->cur[1],
                         ctxt->input->cur[2], ctxt->input->cur[3]);
    }
    ctxt->errNo = XML_ERR_INVALID_ENCODING;

    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}
//encoding.h
/*
 * xmlCharEncoding:
 *
 * Identifiers for the character encodings handled by the parser.  Every
 * enumerator carries an explicit value; XML_CHAR_ENCODING_GB2312 (23) is
 * the entry appended for GB2312 (Simplified Chinese) support.
 */
typedef enum {
    XML_CHAR_ENCODING_ERROR     = -1, /* No char encoding detected */
    XML_CHAR_ENCODING_NONE      =  0, /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8      =  1, /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE   =  2, /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE   =  3, /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE    =  4, /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE    =  5, /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC    =  6, /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 =  7, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 =  8, /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2      =  9, /* UCS-2 */
    XML_CHAR_ENCODING_8859_1    = 10, /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2    = 11, /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3    = 12, /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4    = 13, /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5    = 14, /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6    = 15, /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7    = 16, /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8    = 17, /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9    = 18, /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP   = 19, /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20, /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP    = 21, /* EUC-JP */
    XML_CHAR_ENCODING_ASCII     = 22, /* pure ASCII */
    XML_CHAR_ENCODING_GB2312    = 23  /* GB2312, added for Chinese support */
} xmlCharEncoding;
免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://haidsoft.com/142498.html