zoukankan      html  css  js  c++  java
  • 利用SgmlReader获取网页源代码,进行提取

    1. 根据 SgmlReader 类获得完整的 HTML 代码

       /// <summary>
            /// Downloads the page at <paramref name="uri"/> and re-serializes its HTML
            /// as indented XML by piping it through an SgmlReader into an XmlTextWriter.
            /// </summary>
            /// <param name="uri">Address of the page to download.</param>
            /// <returns>
            /// The converted markup; <c>null</c> when <paramref name="uri"/> is null or empty;
            /// or the exception message when the download or the conversion fails.
            /// </returns>
            private string GetWellFormedHTML(string uri)
            {
                // Nothing to download: keep the original "return null" contract and
                // extend it to a null argument as well.
                if (string.IsNullOrEmpty(uri))
                {
                    return null;
                }

                try
                {
                    // Raw page content, decoded as UTF-8 (same fixed encoding as before).
                    string strWebContent;
                    using (WebClient webclient = new WebClient())
                    {
                        webclient.Encoding = Encoding.UTF8;
                        strWebContent = webclient.DownloadString(uri);
                    }

                    // The using statements release reader/writer on both the success and
                    // the failure path, so the catch block no longer has to close anything
                    // (and can no longer dereference an object that was never created).
                    using (SgmlReader reader = new SgmlReader())
                    using (StringWriter sw = new StringWriter())
                    using (XmlTextWriter writer = new XmlTextWriter(sw))
                    {
                        reader.DocType = "HTML";
                        reader.InputStream = new StringReader(strWebContent);
                        writer.Formatting = System.Xml.Formatting.Indented;

                        // NOTE(review): WriteNode(reader, true) already advances the reader
                        // past the node it copies, so the Read() in the loop condition may
                        // step over a following node. Loop kept as in the original; verify
                        // against real SgmlReader output before changing it.
                        while (reader.Read())
                        {
                            if (reader.NodeType != XmlNodeType.Whitespace)
                            {
                                writer.WriteNode(reader, true);
                            }
                        }

                        // Push any pending writer output into the StringWriter before reading it.
                        writer.Flush();
                        return sw.ToString();
                    }
                }
                catch (Exception exp)
                {
                    // Best-effort contract of the original: report the failure text
                    // to the caller instead of throwing.
                    return exp.Message;
                }
            }
    View Code

    2. 根据 XPath 规则进行查找

      /// <summary>
            /// Parses the given markup as XML and collects everything matched by an XPath query.
            /// </summary>
            /// <param name="htmlStr">Markup to search; it is loaded into an XPathDocument.</param>
            /// <param name="xpath">XPath expression selecting the nodes of interest.</param>
            /// <returns>
            /// For every matched node, two lines: its inner XML followed by its text value.
            /// </returns>
            private string GetResult(string htmlStr, string xpath)
            {
                // Load the markup and obtain a cursor positioned on the document root.
                XPathNavigator root = new XPathDocument(new StringReader(htmlStr)).CreateNavigator();

                StringBuilder output = new StringBuilder();
                foreach (XPathNavigator match in root.Select(xpath))
                {
                    // Full content of the node, child markup included.
                    output.AppendLine(match.InnerXml);
                    // Text value only, without markup.
                    output.AppendLine(match.Value);
                }

                return output.ToString();
            }
    View Code

    完!

  • 相关阅读:
    SDN 实验室学生们
    面向对象程序设计
    软件工程实践
    走出舒适圈的信念和勇气——“Learning by doing!” 我的软工2020春季教学总结
    第二次作业(2)
    结对编程第一战——“停课不停学”数据可视化的数据采集
    团队作业第四次—项目系统设计与数据库设计
    团队作业第一次—团队展示
    软件工程实践2019第五次作业——结对编程的编程实现
    软件工程实践2019——idea表述及组队
  • 原文地址:https://www.cnblogs.com/wwz-wwz/p/7551477.html
Copyright © 2011-2022 走看看