zoukankan      html  css  js  c++  java
  • asp.net mvc抓取微信文章里面所有的图片

    /// <summary>
        /// Downloads a web page and extracts the &lt;img&gt; tags (and their URLs) it contains.
        /// </summary>
        public class WebPageImage
        {
            // Matches an <img> tag and captures the value of its src attribute in "imgUrl".
            // "\bsrc" also matches the tail of "data-src", which lazy-loading pages
            // (e.g. WeChat articles) use instead of a plain src attribute.
            private static readonly Regex ImgTagRegex = new Regex(
                @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
                RegexOptions.IgnoreCase);

            // Captures the charset declared in a <meta> tag, covering both
            // <meta charset="utf-8"> and <meta content="text/html; charset=utf-8">.
            private static readonly Regex MetaCharsetRegex = new Regex(
                @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*(?<charset>[\w.:\-]+)",
                RegexOptions.IgnoreCase);

            /// <summary>
            /// Gets every image tag found on a web page.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <param name="charSet">Encoding name of the page; null or empty to detect it from the page's meta tag.</param>
            /// <returns>The matched &lt;img&gt; tags, each followed by "&lt;br /&gt;" and a line break.</returns>
            public string getImages(string url, string charSet)
            {
                string s = getHtml(url, charSet);
                return getPictures(s, url);
            }

            /// <summary>
            /// Gets every image tag found on a web page, detecting the page encoding automatically.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <returns>The matched &lt;img&gt; tags, each followed by "&lt;br /&gt;" and a line break.</returns>
            public string getImages(string url)
            {
                return getImages(url, "");
            }

            /// <summary>
            /// Returns the host part of an absolute URL.
            /// </summary>
            string doman(string url)
            {
                return new Uri(url).Host;
            }

            /// <summary>
            /// Downloads a page and decodes it to text.
            /// </summary>
            /// <param name="url">Address of the page.</param>
            /// <param name="charSet">
            /// Encoding name of the page. When null or empty, the charset declared in the page's
            /// meta tag is used; if none is declared (or it is not a known encoding) the system
            /// default encoding is used.
            /// </param>
            /// <returns>The decoded page content.</returns>
            /// <exception cref="ArgumentException">The caller-supplied <paramref name="charSet"/> is not a known encoding.</exception>
            string getHtml(string url, string charSet)
            {
                byte[] rawBytes;
                // WebClient is IDisposable; release it as soon as the bytes are in memory.
                // Some pages need extra headers (e.g. a Cookie) to download successfully;
                // add them to client.Headers here when required.
                using (WebClient client = new WebClient())
                {
                    client.Credentials = CredentialCache.DefaultCredentials;
                    rawBytes = client.DownloadData(url);
                }

                // First pass with the default encoding: good enough to read the ASCII meta tag.
                Encoding fallback = Encoding.Default;
                string html = fallback.GetString(rawBytes);

                bool sniffed = string.IsNullOrEmpty(charSet);
                if (sniffed)
                {
                    charSet = MetaCharsetRegex.Match(html).Groups["charset"].Value;
                }
                if (string.IsNullOrEmpty(charSet))
                {
                    return html;
                }

                Encoding target;
                try
                {
                    target = Encoding.GetEncoding(charSet);
                }
                catch (ArgumentException)
                {
                    // A charset scraped from the page may be garbage or unsupported on this
                    // runtime: keep the default decoding. A charset passed in explicitly by
                    // the caller is a programming error and still surfaces.
                    if (sniffed)
                    {
                        return html;
                    }
                    throw;
                }

                return target.Equals(fallback) ? html : target.GetString(rawBytes);
            }

            /// <summary>
            /// Collects every &lt;img&gt; tag in <paramref name="data"/>, with relative image
            /// addresses rewritten to absolute ones.
            /// </summary>
            /// <param name="data">HTML text to scan.</param>
            /// <param name="url">Address of the page the HTML came from; base for relative image paths.</param>
            /// <returns>The tags, each followed by "&lt;br /&gt;" and a line break.</returns>
            string getPictures(string data, string url)
            {
                StringBuilder result = new StringBuilder();
                foreach (Match tag in ImgTagRegex.Matches(data))
                {
                    pictures p = new pictures(tag.Value, url);
                    result.Append(p.GetHtml).Append("<br />").Append(Environment.NewLine);
                }
                return result.ToString();
            }

            /// <summary>
            /// A single &lt;img&gt; tag together with helpers to read its attributes.
            /// </summary>
            public class pictures
            {
                private string _html = string.Empty;

                // Full address of the containing page; relative image paths resolve against it
                // so that the port and the page's directory are honoured.
                private readonly Uri _pageUri;

                /// <summary>
                /// Wraps an &lt;img&gt; tag and makes its src absolute.
                /// </summary>
                /// <param name="strHtml">The HTML of the tag.</param>
                /// <param name="baseUrl">Absolute address of the page that contains the tag.</param>
                /// <exception cref="UriFormatException"><paramref name="baseUrl"/> is not a valid absolute URI.</exception>
                public pictures(string strHtml, string baseUrl)
                {
                    _html = strHtml ?? string.Empty;
                    _pageUri = new Uri(baseUrl);
                    setSrc();
                }

                /// <summary>The tag's HTML, with the src rewritten to an absolute address when it was relative.</summary>
                public string GetHtml
                {
                    get { return _html; }
                }

                /// <summary>Value of the alt attribute, or an empty string when absent.</summary>
                public string Alt
                {
                    get { return GetAttribute("alt")[0]; }
                }

                /// <summary>Value of the src (or data-src) attribute, or an empty string when absent.</summary>
                public string Src
                {
                    get { return GetAttribute("src")[0]; }
                }

                /// <summary>
                /// Converts a relative path to an absolute address.
                /// </summary>
                /// <param name="baseUrl">Absolute base address.</param>
                /// <param name="u">Relative path to convert.</param>
                /// <returns>The absolute address.</returns>
                public string absUrl(string baseUrl, string u)
                {
                    Uri ub = new Uri(baseUrl);
                    Uri ua = new Uri(ub, u);
                    return ua.AbsoluteUri;
                }

                /// <summary>
                /// Rewrites the first src attribute of the tag to an absolute address when it is relative.
                /// </summary>
                private void setSrc()
                {
                    Match attribute = BuildAttributeRegex("src").Match(_html);
                    if (!attribute.Success)
                    {
                        return;
                    }

                    // Keep the original casing: URL paths are case-sensitive.
                    string src = attribute.Groups["value"].Value.Trim();
                    if (src.Length == 0
                        || src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                        || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                        || src.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                    {
                        return;
                    }

                    Uri absolute;
                    if (!Uri.TryCreate(_pageUri, src, out absolute))
                    {
                        // Not resolvable: leave the tag untouched instead of failing the whole page.
                        return;
                    }

                    // Splice by position rather than Regex.Replace, so '$' in the URL is not
                    // interpreted as a substitution token and only this attribute is touched.
                    _html = _html.Substring(0, attribute.Index)
                        + "src=\"" + absolute.AbsoluteUri + "\""
                        + _html.Substring(attribute.Index + attribute.Length);
                }

                /// <summary>
                /// Gets the values of every occurrence of an attribute in the tag.
                /// </summary>
                /// <param name="strAttributeName">Attribute name (matched case-insensitively).</param>
                /// <returns>The attribute values; a single empty string when the attribute is absent.</returns>
                private string[] GetAttribute(string strAttributeName)
                {
                    List<string> values = new List<string>();
                    foreach (Match m in BuildAttributeRegex(strAttributeName).Matches(_html))
                    {
                        values.Add(m.Groups["value"].Value);
                    }
                    if (values.Count == 0)
                    {
                        values.Add("");
                    }
                    return values.ToArray();
                }

                /// <summary>
                /// Builds a regex matching name="value", name='value' or name=value and capturing
                /// the bare value (which may itself contain '=') in the "value" group.
                /// </summary>
                private static Regex BuildAttributeRegex(string name)
                {
                    return new Regex(
                        @"\b" + Regex.Escape(name)
                            + @"\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'<>]+))",
                        RegexOptions.IgnoreCase);
                }
            }

            /// <summary>
            /// Gets the URL of every image in a piece of HTML.
            /// </summary>
            /// <param name="sHtmlText">HTML text to scan.</param>
            /// <returns>The src values of the &lt;img&gt; tags, in document order; empty when there are none.</returns>
            public string[] GetHtmlImageUrlList(string sHtmlText)
            {
                if (string.IsNullOrEmpty(sHtmlText))
                {
                    return new string[0];
                }

                MatchCollection matches = ImgTagRegex.Matches(sHtmlText);
                string[] sUrlList = new string[matches.Count];
                int i = 0;
                foreach (Match match in matches)
                {
                    sUrlList[i++] = match.Groups["imgUrl"].Value;
                }
                return sUrlList;
            }
        }

    调用示例(保存路径、文件名等请根据自己的需要修改):

    /// <summary>
            /// One-click download: fetches the page at <paramref name="url"/>, extracts its image
            /// URLs and saves every image to disk as a GIF file.
            /// </summary>
            /// <param name="url">Address of the page whose images are downloaded.</param>
            /// <returns>
            /// The result of WriteSuccess("1") when every image was saved, otherwise the result of
            /// WriteError with the message of the first failure.
            /// </returns>
            [HttpPost]
            public ActionResult ImgDow(string url)
            {
                try
                {
                    Common.WebPageImage model = new WebPageImage();
                    string tagsHtml = model.getImages(url, "");
                    string[] imageUrls = model.GetHtmlImageUrlList(tagsHtml);

                    // One client for the whole (sequential) batch, disposed when done.
                    using (WebClient client = new WebClient())
                    {
                        foreach (string item in imageUrls)
                        {
                            // Only URLs with more than four '/'-separated segments are downloaded.
                            if (item.Split('/').Length <= 4)
                            {
                                continue;
                            }

                            byte[] imageBytes = client.DownloadData(item);

                            // Image.FromStream needs the stream to stay open for the lifetime of
                            // the image, so both are disposed together after saving.
                            using (MemoryStream ms = new MemoryStream(imageBytes))
                            using (System.Drawing.Image img = System.Drawing.Image.FromStream(ms))
                            {
                                // NOTE(review): the target folder is hard-coded; adjust it for the deployment.
                                img.Save(@"E:/" + DateTime.Now.ToFileTime().ToString() + ".gif", ImageFormat.Gif);
                            }

                            // Pause between downloads, as the original implementation did.
                            System.Threading.Thread.Sleep(1000);
                        }
                    }

                    return WriteSuccess("1");
                }
                catch (Exception ex)
                {
                    return WriteError(ex.Message);
                }
            }
  • 相关阅读:
    vmware 安装提示the msi failed
    答辩修改记录
    科研系统修改记录
    python2.7学习记录之四
    sql语句--查询语句(MySQL)
    lei muban
    共模与差模的区别是什么?
    linux pinmux 引脚多路复用驱动分析与使用
    纯虚函数
    内核与驱动文件的version magic匹配问题
  • 原文地址:https://www.cnblogs.com/MingQiu/p/6652826.html
Copyright © 2011-2022 走看看