zoukankan      html  css  js  c++  java
  • 代理IP抓取

    针对有代理(http://www.youdaili.net/)上的国内代理IP进行了抓取,花了我两天的时间研究,解决了中文乱码的问题,成功地抓取到了IP和端口号。采用的是HtmlAgilityPack技术,下面贴出代码。如果需要下载,请到 http://download.csdn.net/detail/waiwai1015/9035015 ,我上传了代码和库文件。

    /// <summary>
    /// Scrapes the proxy list pages published on youdaili.net whose link title contains
    /// today's date and whose URL contains "guonei", and extracts the proxy entries from
    /// them. HTML is parsed with HtmlAgilityPack.
    /// </summary>
    public class getProxyIp
    {
        // Site root; its front page is scanned for links to today's proxy lists.
        private static readonly string youdaili = "http://www.youdaili.net/";

        // Prefix prepended to the relative pager links found on a list page.
        private static readonly string hrefhead = youdaili + "Daili/guonei/";

        /// <summary>
        /// Downloads <paramref name="url"/>, decodes the body as UTF-8 and parses it.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL of the page to fetch.</param>
        /// <returns>The parsed HTML document.</returns>
        private static HtmlDocument getip(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(new Uri(url));
            req.Method = "GET";

            // Dispose the response and reader so the underlying connection is released.
            using (HttpWebResponse rs = (HttpWebResponse)req.GetResponse())
            using (StreamReader sr = new StreamReader(rs.GetResponseStream(), Encoding.UTF8))
            {
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(sr.ReadToEnd());
                return doc;
            }
        }

        /// <summary>
        /// Collects the URLs of today's "guonei" list pages from the site's front page.
        /// </summary>
        /// <returns>
        /// The matching link targets; an empty list (never <c>null</c>) when none are found.
        /// </returns>
        private static List<string> GetHrefs()
        {
            List<string> ipUrlList = new List<string>();
            HtmlDocument doc = getip(youdaili);

            // Link titles carry the date as "MM月dd". Format with the invariant culture so the
            // result is always Gregorian with ASCII digits, whatever the machine's culture is.
            string today = DateTime.Now.ToString("MM月dd", System.Globalization.CultureInfo.InvariantCulture);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes(".//a");
            if (anchors == null)
            {
                return ipUrlList;
            }

            foreach (HtmlNode anchor in anchors)
            {
                if (anchor.Attributes["title"] == null || anchor.Attributes["href"] == null)
                {
                    continue;
                }

                string title = anchor.Attributes["title"].Value;
                string target = anchor.Attributes["href"].Value;
                if (title.IndexOf(today, StringComparison.Ordinal) < 0 || target.Length == 0)
                {
                    continue;
                }

                // "> 0" (not ">= 0") is kept from the original: the marker must appear after
                // the first character of the URL. Other lists (e.g. URLs containing "guowai")
                // could be included with an analogous check.
                if (target.IndexOf("guonei", StringComparison.Ordinal) > 0)
                {
                    ipUrlList.Add(target);
                }
            }

            return ipUrlList;
        }

        /// <summary>
        /// Expands each list page from <see cref="GetHrefs"/> into the URLs of all of its pages
        /// by reading the pager (<c>ul.pagelist</c>) on the first page.
        /// </summary>
        /// <returns>First-page URLs followed by their continuation-page URLs.</returns>
        private static List<string> GetIPhrefs()
        {
            List<string> ipHrefList = new List<string>();

            foreach (string suburl in GetHrefs())
            {
                ipHrefList.Add(suburl);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(HttpGet(suburl));

                // Expected pager layout (from a sample page):
                //   [0] "N pages in total"  [1] previous  [2] current page "1"
                //   [3..Count-2] pages 2..N (relative hrefs such as '3537_2.html')
                //   [Count-1] next
                HtmlNodeCollection pagerLinks = doc.DocumentNode.SelectNodes("//ul[@class='pagelist']//a");
                if (pagerLinks == null || pagerLinks.Count < 4)
                {
                    continue;
                }

                for (int m = 3; m < pagerLinks.Count - 1; m++)
                {
                    // Skip pager anchors without an href instead of failing the whole scrape.
                    if (pagerLinks[m].Attributes["href"] == null)
                    {
                        continue;
                    }

                    ipHrefList.Add(hrefhead + pagerLinks[m].Attributes["href"].Value);
                }
            }

            return ipHrefList;
        }

        /// <summary>
        /// Downloads every list page for today and extracts the proxy entries.
        /// </summary>
        /// <returns>
        /// For each space-separated token of the first <c>&lt;p&gt;</c> element that contains
        /// an '@', the text before the first '@' (presumably "ip:port" - confirm against the
        /// live page format). Returns <c>null</c> if any request or parse step throws.
        /// </returns>
        public static List<string> GetIPs()
        {
            try
            {
                List<string> ipList = new List<string>();

                foreach (string suburl in GetIPhrefs())
                {
                    HtmlDocument doc = new HtmlDocument();
                    doc.LoadHtml(HttpGet(suburl));

                    // A page without any <p> is skipped rather than discarding the entries
                    // already collected from the other pages.
                    HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("//p");
                    if (paragraphs == null || paragraphs.Count == 0)
                    {
                        continue;
                    }

                    foreach (string token in paragraphs[0].InnerHtml.Split(' '))
                    {
                        string[] parts = token.Split('@');
                        if (parts.Length > 1)
                        {
                            ipList.Add(parts[0]);
                        }
                    }
                }

                return ipList;
            }
            catch (Exception)
            {
                // Best-effort contract kept from the original: failures are reported as null.
                return null;
            }
        }

        /// <summary>
        /// Maps an HTTP charset name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="CharacterSet">
        /// Charset name such as "utf-8" or "gb2312". Matching ignores case, surrounding
        /// whitespace and surrounding quotes; <c>null</c> is allowed.
        /// </param>
        /// <returns>
        /// The GB2312 or UTF-8 encoding, or <see cref="Encoding.Default"/> for anything else.
        /// </returns>
        public static Encoding GetEncoding(string CharacterSet)
        {
            // Servers commonly send "UTF-8" in upper case; normalise before matching so such
            // responses are not decoded with the system default code page.
            string normalized = (CharacterSet ?? string.Empty).Trim().Trim('"', '\'').ToLowerInvariant();

            switch (normalized)
            {
                case "gb2312": return Encoding.GetEncoding("gb2312");
                case "utf-8": return Encoding.UTF8;
                default: return Encoding.Default;
            }
        }

        /// <summary>
        /// Performs an HTTP GET and returns the response body as text, transparently
        /// decompressing gzip/deflate bodies and decoding with the charset the server declares.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL to fetch.</param>
        /// <returns>The decoded response body.</returns>
        /// <exception cref="WebException">The request failed.</exception>
        public static string HttpGet(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Accept = "*/*";
            req.Method = "GET";
            req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";

            using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
            {
                // Ordinal, case-insensitive matching: a culture-sensitive ToLower() would turn
                // "GZIP" into "gzıp" under Turkish cultures and miss the compressed branch.
                string contentEncoding = response.ContentEncoding ?? string.Empty;

                Stream stream;
                if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    stream = new GZipStream(response.GetResponseStream(), CompressionMode.Decompress);
                }
                else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    stream = new DeflateStream(response.GetResponseStream(), CompressionMode.Decompress);
                }
                else
                {
                    stream = response.GetResponseStream();
                }

                // Disposing the reader also disposes the stream it wraps.
                using (StreamReader reader = new StreamReader(stream, GetEncoding(response.CharacterSet)))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }

  • 相关阅读:
    D11 列表 list 元祖 字典dict
    D10 基本数据类型(各种职业的技能分析) 主要为 int 和 str
    Python D9 学习
    面向对象方法传参实现数组求和,求平均值
    用带参数的方法给空数组放元素,寻找数组里面的值是否存在。
    两种方法把类和对象写在同一个文件内
    创建一个管理员对象,输入正确用户名和密码,可以修改密码(类和对象分为两个文件,区别于放在一个文件内)
    创建一个游客对象,输入信息判断游客年龄是否免费游览
    建立一个学生对象,输出学生信息
    把输入的数字转为数组,拿出其中的最小值
  • 原文地址:https://www.cnblogs.com/waiwai1015/p/4750079.html
Copyright © 2011-2022 走看看