zoukankan      html  css  js  c++  java
  • 采集相关类

    using System;
    using System.Data;
    using System.Configuration;
    using System.Net;
    using System.IO;
    using System.Text;
    using System.Collections.Generic;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Web;

    namespace Common
    {
        public class Utils
        {


           
            /// <summary>   
            /// 这私有方法从网页的HTML代码中分析出链接信息   
            /// </summary>   
            /// <returns>List<Link></returns>   
            public static List<string> getLinks(string html)   
                {       
                 List<string> m_links=new List<string>();
                 Uri m_uri = new Uri("http://www.suning.com/") ;//网址
                    if (m_links.Count == 0)       
                    {           
                        Regex[] regex = new Regex[2];           
                        regex[0] = new Regex("(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(file:///w://W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);           
     
                            Match match = regex[0].Match(html);               
                            while (match.Success)               
                            {                   
                                try                   
                                {                       
                                    string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;

                                    m_links.Add(url);                   
                                }                   
                                catch(Exception ex)
                                {
                                    Console.WriteLine(ex.Message);
                                };                   
                                match = match.NextMatch();               
                            }                
                    }       
                    return m_links;   
                }

            public static string GetHtml(string url)
            {
                System.Net.WebClient wc = new System.Net.WebClient();

                System.IO.Stream sm = wc.OpenRead(url); 

                 System.IO.StreamReader sr = new System.IO.StreamReader(sm, System.Text.Encoding.Default, true, 256000); 

                 string html = sr.ReadToEnd(); 

                 sr.Close();
                 return html;
            }

            /// <summary>
            /// 获得基础流
            /// </summary>
            /// <param name="uri">网址</param>
            /// <param name="cc">cookie容器,可以为NULL</param>
            /// <returns></returns>
            public static Stream GetBaseStream(string uri, CookieContainer cc)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);   //用指定Uri创建一个request
                    if (cc != null)
                    {
                        request.CookieContainer = cc;
                    }
                    //浏览器欺骗
                    request.ContentType = "application/x-www-form-urlencoded";
                    request.Accept = @"application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
                    request.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";

                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();      //根据创建的request得到响应response
                    Stream responseStream = response.GetResponseStream();  //创建一个流来获得响应体
                    return responseStream;
                }
                catch (Exception ex)
                {
                    //MessageBox.Show(@"操作失败:" + ex.Message);
                    return null;
                }
            }

            /// <summary>
            /// 获得网页
            /// </summary>
            /// <param name="uri">网址</param>
            /// <param name="postDate"></param>
            /// <param name="cc">cookie容器,可以为null</param>
            /// <param name="encoding">网页编码</param>
            /// <returns></returns>
            public static string GetHtmlString(string uri, string postDate, CookieContainer cc, Encoding encoding)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
                    request.ContentType = "application/x-www-form-urlencoded";
                    request.AllowAutoRedirect = true;
                    request.Accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
                    request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2 ChromePlus/1.3.9.0";
                    request.CookieContainer = cc;     //设置request产生cookie的容器
                    if (postDate != null)
                    {
                        request.Method = "Post";
                        byte[] byterequest = Encoding.UTF8.GetBytes(postDate);
                        request.ContentLength = byterequest.Length;
                        using (Stream stream = request.GetRequestStream())
                        {
                            stream.Write(byterequest, 0, byterequest.Length);
                        }

                    }

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        using (Stream responsestream = response.GetResponseStream())
                        {
                            StreamReader sr = new StreamReader(responsestream, encoding);
                            string html = sr.ReadToEnd();
                            return html;
                        }
                    }
                }
                catch (Exception ex)
                {
                    //MessageBox.Show(@"发生错误:" + ex.Message);
                    return null;
                }
            }

            /// <summary>
            /// 从字符串中返回匹配多个的集合值(网页抽取特定部分有效)
            /// </summary>
            /// <param name="start">开始html tag</param>
            /// <param name="end">结束html tag</param>
            /// <param name="html">html</param>
            /// <returns></returns>
            public static List<string> GetStrings(string html,string start, string end)
            {
                List<string> list = new List<string>();
                try
                {
                    string pattern = string.Format("{0}(?<g>(.|[\r\n])+?){1}", start, end);//匹配URL的模式,并分组    //理解这个正则
                    MatchCollection mc = Regex.Matches(html, pattern);//满足pattern的匹配集合
                    if (mc.Count != 0)
                    {
                        foreach (Match match in mc)
                        {
                            GroupCollection gc = match.Groups;
                            list.Add(gc["g"].Value);
                        }
                    }
                }
                catch
                { }
                return list;
            }

        }
    }

  • 相关阅读:
    Hadoop命令手册
    编程算法
    综合8种子排序算法总结和比较
    android 创建一个新的每次project什么时候 请问自己主动 参加 V7依赖?
    【JDBC】java PreparedStatement操作oracle数据库
    【cocos2dx 加载资源目录】
    Project Euler:Problem 39 Integer right triangles
    矿Java开发学习之旅------>Java排序算法经典的二分法插入排序
    [React Intl] Render Content with Placeholders using react-intl FormattedMessage
    [React Intl] Install and Configure the Entry Point of react-intl
  • 原文地址:https://www.cnblogs.com/andylaufzf/p/2099676.html
Copyright © 2011-2022 走看看