zoukankan      html  css  js  c++  java
  • 利用SgmlReader获取网页源代码,进行提取

    1. 根据 SgmlReader 类获得完整的 HTML 代码

       /// <summary>
            /// Downloads the page at <paramref name="uri"/> (decoded as UTF-8) and converts
            /// its HTML into well-formed, indented XML by piping it through SgmlReader.
            /// </summary>
            /// <param name="uri">Address of the page to download.</param>
            /// <returns>
            /// The page as well-formed XML; <c>null</c> when <paramref name="uri"/> is null
            /// or empty; or the exception message when the download or conversion fails.
            /// </returns>
            private string GetWellFormedHTML(string uri)
            {
                // Also covers null, which previously fell through to DownloadString.
                if (string.IsNullOrEmpty(uri))
                {
                    return null;
                }

                try
                {
                    string strWebContent;
                    using (WebClient webclient = new WebClient())
                    {
                        webclient.Encoding = Encoding.UTF8;
                        strWebContent = webclient.DownloadString(uri);
                    }

                    // "using" releases every resource on both the success and failure
                    // paths; the old catch block dereferenced variables that could be
                    // null (one of them always was) and so masked the real error.
                    using (StringWriter sw = new StringWriter())
                    using (SgmlReader reader = new SgmlReader())
                    using (XmlTextWriter writer = new XmlTextWriter(sw))
                    {
                        reader.DocType = "HTML";
                        reader.InputStream = new StringReader(strWebContent);
                        writer.Formatting = System.Xml.Formatting.Indented;

                        // WriteNode leaves the reader positioned on the node that follows
                        // the one it copied, so Read() is only called explicitly to start
                        // the stream and to step over whitespace. Calling Read() after
                        // every WriteNode would silently drop the following node.
                        reader.Read();
                        while (!reader.EOF)
                        {
                            if (reader.NodeType == XmlNodeType.Whitespace)
                            {
                                reader.Read();
                            }
                            else
                            {
                                writer.WriteNode(reader, true);
                            }
                        }

                        writer.Flush();
                        return sw.ToString();
                    }
                }
                catch (Exception exp)
                {
                    // Kept for backward compatibility: failures are reported to the
                    // caller as the exception message rather than rethrown.
                    return exp.Message;
                }
            }
    View Code

    2. 根据 XPath 规则进行查找

      /// <summary>
            /// Loads well-formed markup and collects the content of every node matched
            /// by an XPath expression.
            /// </summary>
            /// <param name="htmlStr">Well-formed XML/XHTML source to query.</param>
            /// <param name="xpath">XPath expression selecting the target nodes.</param>
            /// <returns>
            /// For each matched node, its inner XML on one line followed by its text
            /// value on the next; an empty string when nothing matches.
            /// </returns>
            private string GetResult(string htmlStr, string xpath)
            {
                XPathNavigator root = new XPathDocument(new StringReader(htmlStr)).CreateNavigator();
                StringBuilder output = new StringBuilder();

                foreach (XPathNavigator match in root.Select(xpath))
                {
                    // Inner markup first (child elements and their attributes) ...
                    output.AppendLine(match.InnerXml);
                    // ... then the concatenated text content only.
                    output.AppendLine(match.Value);
                }

                return output.ToString();
            }
    View Code

    完!

  • 相关阅读:
    Busybox制作ARM(iTOP4412) 根文件系统
    01.高并发底层原理
    设计模式
    高并发实战
    # 记一次shell编写
    shell if条件语句
    scrapy使用
    整理JAVA知识点--基础篇,能力有限不足地方请大神们帮忙完善下
    mybatis-generator使用
    优先级队列实现
  • 原文地址:https://www.cnblogs.com/wwz-wwz/p/7551477.html
Copyright © 2011-2022 走看看