参考:
对XML文件读取和编辑2-QXmlStreamReader读取 - 知乎
https://zhuanlan.zhihu.com/p/358862429
本地环境:
win10专业版,64位,Qt 5.12
代码已测试通过。
问题描述
需要按字节读取一个文档,解析其中具有xml格式的部分,并存储到一个Hash表中,方便使用。xml标签上可能带有属性信息,如下图红线所示:
解决思路
按字节读取参考:
qt5-入门-QByteArray-CSDN博客
https://blog.csdn.net/pxy7896/article/details/137583591
提取后发现文字内容大概如下:
"<Notes><UUID>71bf0eb6-0477-41e8-8520-f1f5fafac932</UUID><Type>Synthetic</Type><ConfirmedExperimentally>0</ConfirmedExperimentally><CustomMapLabel>Y14837</CustomMapLabel><UseCustomMapLabel>1</UseCustomMapLabel><Description>Cloning vector pUC57, complete sequence.</Description><Created UTC=\"1:41:49\">2020.7.2</Created><LastModified UTC=\"1:8:0\">2021.10.8</LastModified><AccessionNumber>Y14837</AccessionNumber><SequenceClass>UNA</SequenceClass><TransformedInto>unspecified</TransformedInto><References><Reference authors=\"Markausakas A, Dreguniene G.\" journal=\"Unpublished\" title=\"A new cloning vector pUC57\"/><Reference authors=\"Markauskas A.\" journal=\"Submitted (16-SEP-1997) A. Markauskas, Fermentas AB, Graiciuno 8, Vilnius 2028, LITHUANIA\" title=\"Direct Submission\"/></References><Comments><a href='http://www.informaxinc.com/'>http://www.informaxinc.com/</a><br>ORIGDB|GenBank</Comments></Notes>"
可以看到,<Reference> 标签不仅携带属性信息,还可能重复出现,所以应该使用 QMultiHash(同一个键可以对应多个值)。
总的设计思路是:当 QXmlStreamReader::TokenType 是 QXmlStreamReader::StartElement 时,读取标签名称和属性信息,存储到临时字典里;当是 QXmlStreamReader::Characters 时,读取标签内容;当是 QXmlStreamReader::EndElement 时,把临时字典存储到外层字典中,并清空临时值。这样一直读取到这部分结束。
实现
void process(QXmlStreamReader& xml, QMultiHash<QString, QHash<QString, QString>>& hash) {// 临时存储QString name, value;QHash<QString, QString> attrHash;while(!xml.atEnd()) {QXmlStreamReader::TokenType token = xml.readNext();switch ((int)token) {case QXmlStreamReader::NoToken://qDebug()<<"没有读到任何东西";break;case QXmlStreamReader::Invalid://qDebug()<<"发生错误,在error()和errorString()中报告.";break;case QXmlStreamReader::StartDocument://qDebug()<<"读取文件开始-"<<"版本号:"<<xml.documentVersion()<<"编码格式:"<<xml.documentEncoding();break;case QXmlStreamReader::EndDocument://qDebug()<<"读取文件结束";break;case QXmlStreamReader::StartElement: //开始读取一个元素{// 如果是元素开始标签name = xml.name().toString();// 输出标签的属性QXmlStreamAttributes attributes = xml.attributes();// 此时有属性,需要填充字典if (!attributes.isEmpty()) {foreach (const QXmlStreamAttribute &attribute, attributes) {attrHash.insert(attribute.name().toString(), attribute.value().toString());}}}break;case QXmlStreamReader::EndElement: //读取一个元素结束{if(name == xml.name().toString()) {//attrHash.insert("name", name);attrHash.insert("value", value);hash.insert(name, attrHash);}// 清空name = "";value = "";attrHash.clear();}break;case QXmlStreamReader::Characters: //读取元素中的文本信息{QString str = xml.text().toString();if(!xml.isWhitespace()){value = str;}}break;case QXmlStreamReader::Comment: //文本注释break;case QXmlStreamReader::ProcessingInstruction://qDebug()<<"ProcessingInstruction: "<< xml.text();break;}} // 读取结束
}
使用:
// Decode the raw block explicitly as UTF-8. This is the decoding the implicit
// QByteArray -> QString conversion performs in Qt 5, but it also compiles when
// QT_NO_CAST_FROM_ASCII is defined.
// Note: the raw bytes may contain '\n' characters; remove them first if needed.
const QString blockContent = QString::fromUtf8(byteArray.mid(ptr, blockSize));
QXmlStreamReader xml(blockContent);
QMultiHash<QString, QHash<QString, QString>> curHash;
process(xml, curHash);
// Print the result. qDebug() already terminates each message with a newline,
// so no explicit endl is needed.
for (auto it = curHash.constBegin(); it != curHash.constEnd(); ++it) {
    qDebug() << it.key() << it.value();
}
/* Parse result
"Created" QHash(("value", "2020.7.2")("UTC", "1:41:49"))
"LastModified" QHash(("value", "2021.10.8")("UTC", "1:8:0"))
"Comments" QHash(("value", "<a href='http://www.informaxinc.com/'>http://www.informaxinc.com/</a><br>ORIGDB|GenBank"))
"Type" QHash(("value", "Synthetic"))
"Description" QHash(("value", "Cloning vector pUC57, complete sequence."))
"CustomMapLabel" QHash(("value", "Y14837"))
"UseCustomMapLabel" QHash(("value", "1"))
"ConfirmedExperimentally" QHash(("value", "0"))
"SequenceClass" QHash(("value", "UNA"))
"UUID" QHash(("value", "71bf0eb6-0477-41e8-8520-f1f5fafac932"))
"TransformedInto" QHash(("value", "unspecified"))
"Reference" QHash(("value", "")("journal", "Submitted (16-SEP-1997) A. Markauskas, Fermentas AB, Graiciuno 8, Vilnius 2028, LITHUANIA")("authors", "Markauskas A.")("title", "Direct Submission"))
"Reference" QHash(("value", "")("journal", "Unpublished")("authors", "Markausakas A, Dreguniene G.")("title", "A new cloning vector pUC57"))
"AccessionNumber" QHash(("value", "Y14837"))
*/