zoukankan      html  css  js  c++  java
  • 网页内容、图片及链接抓取通用类(转)

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Collections;
    using System.IO.Compression;

    /// <summary>
    /// Name: web page scraping helper.
    /// Fetches a page over HTTP and extracts its title, description, links,
    /// body text and images using regular expressions.
    /// Author: loafinweb
    /// Date: 2011-09-12
    /// </summary>
    public class webCrawl
    {
        /// <summary>Timeout, in milliseconds, applied to every HTTP request.</summary>
        private const int RequestTimeoutMs = 30000;

        /// <summary>Responses whose declared Content-Length is this many bytes or more are ignored.</summary>
        private const long MaxContentLength = 1024 * 1024;

        // Accepts quoted, single-quoted and unquoted values:
        //   <meta charset="utf-8">, content="text/html; charset=gb2312", charset='utf-8'
        private static readonly Regex CharsetRegex =
            new Regex(@"charset\s*=\s*[""']?\s*(?<charset>[\w.:-]+)", RegexOptions.IgnoreCase);

        private static readonly Regex TitleRegex =
            new Regex(@"<title\b[^>]*>([^<]*)</title>", RegexOptions.IgnoreCase);

        // The capture stops at the closing quote so attributes that follow
        // "content" are not swallowed into the description.
        private static readonly Regex DescriptionRegex =
            new Regex("<meta name=\"Description\" content=\"([^\"<]*)\"?[^<>]*>", RegexOptions.IgnoreCase);

        private static readonly Regex NonWordRegex = new Regex(@"\W");

        private static readonly Regex LinkRegex =
            new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w ./?%&=-]*)?", RegexOptions.IgnoreCase);

        private static readonly Regex BodyRegex =
            new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");

        private static readonly Regex ImgTagRegex =
            new Regex(@"<img[^>]+>", RegexOptions.IgnoreCase);

        private static readonly Regex ImgSrcRegex =
            new Regex(@"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>", RegexOptions.IgnoreCase);

        private static readonly Regex ImageFileRegex =
            new Regex(@"\w+\.(gif|jpe?g|bmp|png)$", RegexOptions.IgnoreCase);

        private static readonly Regex TagRegex = new Regex("<[^>]*>");

        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        public webCrawl() { }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its decoded text.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>
        /// The page text, or an empty string when the request fails, the status is
        /// not 200 OK, or the declared content length is 1 MiB or more.
        /// </returns>
        public static string getHtml(string url)
        {
            try
            {
                // A first request sniffs the charset; an unknown name falls back to the
                // system default encoding instead of discarding the whole page.
                Encoding encoding = ResolveEncoding(getEncoding(url));

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Headers.Set("Pragma", "no-cache");
                request.Timeout = RequestTimeoutMs;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                    {
                        return string.Empty;
                    }

                    using (Stream body = OpenBodyStream(response))
                    using (StreamReader reader = new StreamReader(body, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best-effort: every extractor in this class treats an
                // empty string as "page unavailable", so failures are not propagated.
                return string.Empty;
            }
        }

        /// <summary>
        /// Determines the character set name of the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>
        /// The charset declared in the markup if present, otherwise the charset of the
        /// HTTP response, otherwise the body name of the system default encoding.
        /// Redirects are not followed; a non-200 response yields the default.
        /// </returns>
        /// <exception cref="WebException">The request failed.</exception>
        public static string getEncoding(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = RequestTimeoutMs;
            request.AllowAutoRedirect = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode == HttpStatusCode.OK && response.ContentLength < MaxContentLength)
                {
                    string html;
                    // ASCII is sufficient here: only the charset declaration is inspected.
                    using (Stream body = OpenBodyStream(response))
                    using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
                    {
                        html = reader.ReadToEnd();
                    }

                    Match declared = CharsetRegex.Match(html);
                    if (declared.Success)
                    {
                        return declared.Groups["charset"].Value;
                    }

                    if (!string.IsNullOrEmpty(response.CharacterSet))
                    {
                        return response.CharacterSet;
                    }
                }
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Returns the text of the page's <c>&lt;title&gt;</c> element.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>
        /// The title with every non-word character removed (whitespace and
        /// punctuation alike), or an empty string when no title is found.
        /// </returns>
        public static string getTitle(string url)
        {
            Match titleMatch = TitleRegex.Match(getHtml(url));
            return NonWordRegex.Replace(titleMatch.Groups[1].Value, string.Empty);
        }

        /// <summary>
        /// Returns the content of the page's <c>&lt;meta name="Description"&gt;</c> tag.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>
        /// The description with every non-word character removed, or an empty
        /// string when the tag is not found.
        /// </returns>
        public static string getDescription(string url)
        {
            Match descriptionMatch = DescriptionRegex.Match(getHtml(url));
            return NonWordRegex.Replace(descriptionMatch.Groups[1].Value, string.Empty);
        }

        /// <summary>
        /// Extracts every distinct http/https URL that appears in the given markup.
        /// </summary>
        /// <param name="htmlStr">HTML text to scan; null or empty yields an empty list.</param>
        /// <returns>The distinct URLs in order of first appearance.</returns>
        public static List<string> getLink(string htmlStr)
        {
            List<string> links = new List<string>();
            if (string.IsNullOrEmpty(htmlStr))
            {
                return links;
            }

            // The set gives O(1) duplicate detection; the list preserves first-seen order.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match match in LinkRegex.Matches(htmlStr))
            {
                if (seen.Add(match.Value))
                {
                    links.Add(match.Value);
                }
            }
            return links;
        }

        /// <summary>
        /// Returns the text inside the page's <c>&lt;body&gt;</c> element with markup
        /// and whitespace stripped (see <see cref="parseHtml"/>).
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>The stripped body text, or an empty string when no body is found.</returns>
        public static string getBody(string url)
        {
            Match bodyMatch = BodyRegex.Match(getHtml(url));
            return bodyMatch.Success ? parseHtml(bodyMatch.Value) : string.Empty;
        }

        /// <summary>
        /// Returns every <c>&lt;img ...&gt;</c> tag found in the page, as raw markup.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>The matched tags in document order.</returns>
        public static List<string> getImg(string url)
        {
            List<string> tags = new List<string>();
            foreach (Match match in ImgTagRegex.Matches(getHtml(url)))
            {
                tags.Add(match.Value);
            }
            return tags;
        }

        /// <summary>
        /// Returns the source URL of every gif/jpg/jpeg/bmp/png image in the page.
        /// Relative paths are resolved against <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <returns>Image URLs in document order (duplicates are kept).</returns>
        public static List<string> getImgPath(string url)
        {
            List<string> paths = new List<string>();
            string html = getHtml(url);

            Uri baseUri;
            bool hasBaseUri = Uri.TryCreate(url, UriKind.Absolute, out baseUri);

            foreach (Match match in ImgSrcRegex.Matches(html))
            {
                string imgPath = match.Groups["imgUrl"].Value.Trim();

                // Second filter: keep only sources that end in an image file extension.
                if (!ImageFileRegex.IsMatch(imgPath))
                {
                    continue;
                }

                if (!IsAbsoluteHttpUrl(imgPath))
                {
                    // Proper URI resolution handles "/root", "../up" and "//host" forms,
                    // which plain string concatenation gets wrong.
                    Uri resolved;
                    if (hasBaseUri && Uri.TryCreate(baseUri, imgPath, out resolved))
                    {
                        imgPath = resolved.AbsoluteUri;
                    }
                    else
                    {
                        imgPath = getUrl(url) + imgPath;
                    }
                }

                paths.Add(imgPath);
            }
            return paths;
        }

        /// <summary>
        /// Downloads an image into the directory mapped by the current ASP.NET request,
        /// naming it with a timestamp plus the original file extension.
        /// </summary>
        /// <param name="fileurl">
        /// Absolute image URL, e.g. http://xxx.com/img/logo.jpg. Nothing is downloaded
        /// when the value is null/empty, contains no '.', or its path has no file extension.
        /// </param>
        /// <exception cref="UriFormatException"><paramref name="fileurl"/> is not an absolute URL.</exception>
        /// <exception cref="InvalidOperationException">There is no current HTTP context.</exception>
        /// <exception cref="WebException">The download failed.</exception>
        public void DownloadImg(string fileurl)
        {
            if (string.IsNullOrEmpty(fileurl) || fileurl.IndexOf('.') < 0)
            {
                return;
            }

            Uri source = new Uri(fileurl, UriKind.Absolute);

            // Take the extension from the URL path only, so a query string or a
            // dot in the host name never ends up in the file name.
            string extension = Path.GetExtension(source.AbsolutePath);
            if (string.IsNullOrEmpty(extension))
            {
                return;
            }

            System.Web.HttpContext context = System.Web.HttpContext.Current;
            if (context == null)
            {
                throw new InvalidOperationException("DownloadImg requires an active HTTP context.");
            }

            string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
            string filepath = Path.Combine(context.Server.MapPath(""), imgName);

            using (WebClient client = new WebClient())
            {
                client.DownloadFile(source, filepath);
            }
        }

        /// <summary>
        /// Strips HTML tags, stray angle brackets and all whitespace from the input.
        /// </summary>
        /// <param name="html">Markup to clean; null or empty yields an empty string.</param>
        /// <returns>The remaining text with no whitespace at all.</returns>
        public static string parseHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return string.Empty;
            }

            string text = TagRegex.Replace(html, string.Empty);
            text = text.Replace("<", string.Empty).Replace(">", string.Empty);
            return WhitespaceRegex.Replace(text, string.Empty);
        }

        /// <summary>
        /// Returns the directory part of a URL, always ending in '/'.
        /// http://www.xxx.com          -> http://www.xxx.com/
        /// http://www.xxx.com/art.aspx -> http://www.xxx.com/
        /// </summary>
        /// <param name="url">The URL to trim.</param>
        /// <returns>The URL up to and including the last '/' of its path.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public static string getUrl(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            Uri parsed;
            if (Uri.TryCreate(url, UriKind.Absolute, out parsed))
            {
                // Drop query and fragment first so a '/' inside them is not mistaken
                // for a path separator; a bare host is normalised to "host/".
                string withoutQuery = parsed.GetLeftPart(UriPartial.Path);
                return withoutQuery.Substring(0, withoutQuery.LastIndexOf('/') + 1);
            }

            int lastSlash = url.LastIndexOf('/');
            return lastSlash < 0 ? url + "/" : url.Substring(0, lastSlash + 1);
        }

        /// <summary>Maps a charset name to an Encoding, falling back to the system default.</summary>
        private static Encoding ResolveEncoding(string name)
        {
            if (!string.IsNullOrEmpty(name))
            {
                try
                {
                    return Encoding.GetEncoding(name.Trim());
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: fall through to the default below.
                }
                catch (NotSupportedException)
                {
                    // Code page not available on this platform: use the default.
                }
            }
            return Encoding.Default;
        }

        /// <summary>Opens the response body, transparently decompressing gzip content.</summary>
        private static Stream OpenBodyStream(HttpWebResponse response)
        {
            Stream raw = response.GetResponseStream();
            string contentEncoding = response.ContentEncoding;
            if (contentEncoding != null && contentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
            {
                return new GZipStream(raw, CompressionMode.Decompress);
            }
            return raw;
        }

        /// <summary>Tells whether the value starts with the http:// or https:// scheme.</summary>
        private static bool IsAbsoluteHttpUrl(string value)
        {
            return value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        }
    }
  • 相关阅读:
    序列化
    python_模块与包
    python_常用内置模块
    python_生成器
    python_文件操作
    你好,mysql
    2017年12月20日 内置对象
    2017年12月17日 ASP.NET 12个表单元素&&简单控件/复合控件
    2017年12月16日 ASP.NET基本用法
    2017年12月14日 LinQ高级查&&Asp.net WebForm Asp.net MVC
  • 原文地址:https://www.cnblogs.com/zhang9418hn/p/2175173.html
Copyright © 2011-2022 走看看