方案
1. 以 GBK 编码读取 XML 文件为字符串, 并将 XML 声明中的编码替换为 UTF-8
2. 将字符串编码为 UTF-8 字节串, 再直接解析
代码
def parse_xml_node(node):
    """Recursively convert an ElementTree element into plain Python data.

    Args:
        node: An ``xml.etree.ElementTree.Element`` (or any object that
            iterates over its child elements and exposes ``tag``/``text``).

    Returns:
        For a leaf element (no children): its text, or ``''`` when the
        element has no text.
        For an element with children: a dict mapping each child tag to the
        converted child. When a tag occurs more than once among the
        children, the values are collected into a list in document order.

    Note:
        Attributes, tail text, and the text of elements that have children
        are not included in the result.
    """
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, while list(element) works on every version.
    children = list(node)
    if not children:
        return node.text if node.text is not None else ''

    node_dict = {}
    for child in children:
        value = parse_xml_node(child)
        if child.tag not in node_dict:
            node_dict[child.tag] = value
            continue
        # Repeated tag: promote the first value to a list, then append.
        existing = node_dict[child.tag]
        if not isinstance(existing, list):
            existing = node_dict[child.tag] = [existing]
        existing.append(value)
    return node_dict
def parse_gbk_xml(filename):
    """Parse a GBK-encoded XML file into a nested dict.

    The file is decoded as GBK, the encoding named in its XML declaration
    is rewritten to UTF-8, and the text is re-encoded as UTF-8 bytes before
    being handed to ElementTree, so the declaration matches the bytes that
    the parser actually receives.

    Args:
        filename: Path of the GBK-encoded XML file.

    Returns:
        A dict with a single key, the root element's tag, whose value is
        the result of ``parse_xml_node`` on the root element.

    Raises:
        IOError/OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid GBK.
        xml.etree.ElementTree.ParseError: If the content is not
            well-formed XML.
    """
    import io
    import re
    from xml.etree import ElementTree

    with io.open(filename, 'r', encoding='gbk') as fp:
        text = fp.read()

    # Rewrite only the encoding pseudo-attribute of the leading XML
    # declaration. A regex is used instead of a literal replace so that
    # differences in spacing, quote style, or letter case ("gbk"/"GBK")
    # in the declaration do not silently skip the substitution.
    text = re.sub(
        r'\A(<\?xml\s[^>]*?encoding\s*=\s*)(["\'])[^"\']*\2',
        r'\g<1>\g<2>UTF-8\g<2>',
        text,
        count=1,
    )

    element = ElementTree.fromstring(text.encode('utf-8'))
    # The original built this dict but never returned it.
    return {element.tag: parse_xml_node(element)}
结果验证:
- # 文本内容
- <?xml version="1.0" encoding="GBK"?>
- <root>
- <head>
- <code>1</code>
- <message>正确</message>
- <value>320202</value>
- </head>
- </root>
- # 解析结果
- {'root': {'head': {'message': u'\u6b63\u786e', 'code': '1', 'value': '320202'}}}