| 名称 | 值 | 说明 |
| --- | --- | --- |
| wd | 任意文字 | 关键字 |
| rn | 可以不指定,默认为10;最小为1,最大为50,可设置为该范围内的任意值 | 一页包含的结果条目数 |
| pn | 百度默认最多显示760条结果,所以每页10条时最后一页为pn=750 | 第一条结果的索引位置 |
在Qt中,使用 QDomDocument 或 QXmlStreamReader 来解析 HTML 文件都失败了。经分析,其原因是:QDomDocument 和 QXmlStreamReader 都是针对解析 XML 文件设计的,而 HTML 并不严格遵循 XML 的语法规范(参见"HTML与XML的区别")。
经过查找资料,TidyLib 库正好可以解决这个问题。
Tidy is a console application for Mac OS X, Linux, Windows, UNIX, and more. It corrects and cleans up HTML and XML documents by fixing markup errors and upgrading legacy code to modern standards.
libtidy is a C static and dynamic library that developers can integrate into their applications in order to bring all of Tidy’s power to your favorite tools.
libtidy is used today in desktop applications, web servers, and more.
TidyLib 会修复 HTML 文件中可能存在的格式错误,并输出 XHTML。XHTML 格式符合 XML 规范,可以使用 QDomDocument 或 QXmlStreamReader 来解析;也可以使用 TidyLib 库自带的解析函数提取想要的元素。
#ifndef HTMLPARSE_H #define HTMLPARSE_H #include <QDomDocument> class HtmlParse { public: HtmlParse(); bool setDatas(const QByteArray& datas); QList<QDomElement> getResults(); private: private: QDomDocument doc; }; #endif // HTMLPARSE_H /*********************************************************************************/ #include "htmlparse.h" #include <QDataStream> #include <QTextStream> #include <QDebug> #include "tidy.h" #include "tidybuffio.h" #include "tidyenum.h" #include "tidyplatform.h" #include "errno.h" #include <QStandardPaths> #include <QDir> #include <QDomDocument> #include <QRegularExpression> #include <QRegularExpressionMatch> HtmlParse::HtmlParse() { } bool HtmlParse::setDatas(const QByteArray &datas) { bool result = false; TidyBuffer output = {0}; TidyBuffer errbuf = {0}; int rc = -1; Bool ok; TidyDoc tdoc = tidyCreate(); // Initialize "document" ok = tidyOptSetBool( tdoc, TidyXhtmlOut, yes ); // Convert to XHTML if ( ok ) rc = tidySetErrorBuffer( tdoc, &errbuf ); // Capture diagnostics if ( rc >= 0 ) rc = tidyParseString( tdoc, datas.data() ); // Parse the input if ( rc >= 0 ) rc = tidyCleanAndRepair( tdoc ); // Tidy it up! if ( rc >= 0 ) rc = tidyRunDiagnostics( tdoc ); // Kvetch if ( rc > 1 ) // If error, force output. rc = ( tidyOptSetBool(tdoc, TidyForceOutput, yes) ? 
rc : -1 ); if ( rc >= 0 ) rc = tidySaveBuffer(tdoc, &output); // Pretty Print if ( rc >= 0 ) { if (doc.setContent(QByteArray((char *)output.bp))) { result = true; } } tidyBufFree( &output ); tidyBufFree( &errbuf ); tidyRelease( tdoc ); return result; } QList<QDomElement>& findResults(const QDomNode& pnode, const QString& tagName, const QHash<QString, QString>& validators, QList<QDomElement>& results) { QDomNode n = pnode.firstChild(); while (!n.isNull()) { if (n.isElement()) { // 递归,当前节点的子节点 findResults(n, tagName, validators, results); QDomElement elm = n.toElement(); // 需要检测tagName时,如果tagName不符合则跳过 if (!tagName.isEmpty() && elm.tagName() != tagName) { n = n.nextSibling(); continue; } // 取出当前节点的所有键值对 QHash<QString, QString> ha; auto attrs = elm.attributes(); for (int i = 0; i < attrs.count(); i++) { QDomAttr attr = attrs.item(i).toAttr(); ha.insert(attr.name(), attr.value()); } bool isValid = true; QHash<QString, QString>::const_iterator it = validators.begin(); while (it != validators.end()) { QHash<QString, QString>::const_iterator fi = ha.find(it.key()); if (fi == ha.end()) { isValid = false; break; } // 如果为空,则跳过 if (it.value().isEmpty()) { it++; continue; } QRegularExpression exp(it.value()); QRegularExpressionMatch mc = exp.match(fi.value()); if (!mc.hasMatch()) { isValid = false; break; } it++; } if (isValid) results.append(elm); } // 下一个兄弟节点 n = n.nextSibling(); } return results; } QList<QDomElement > HtmlParse::getResults() { QList<QDomElement> elements; QList<QDomElement> hrefElements; QHash<QString, QString> validators; validators.insert("class", "result"); validators.insert("id", "\d+"); validators.insert("srcid", "\d+"); findResults(doc, "div", validators, elements); qDebug() << elements.count(); for (auto var : elements) { qDebug() << var.attribute("id"); validators.clear(); validators.insert("href", ""); findResults(var, "a", validators, hrefElements); for (auto href : hrefElements) { qDebug() << href.text() << href.attribute("href"); } } return 
hrefElements; }