zoukankan      html  css  js  c++  java
  • C# 抓取网页的img src带参数的图片链接,并下载

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using System.Windows.Forms;
    
    namespace ImageCollection
    {
        /// <summary>
        /// Main form of the image collector: fetches a page, lists the image URLs found in
        /// its img tags and downloads those served by <see cref="AllowedHost"/> into the
        /// "img" folder under the application base directory.
        /// </summary>
        public partial class Form1 : Form
        {
            // Download folder. Path.Combine is used so the result is correct whether or not
            // BaseDirectory ends with a directory separator.
            private static string Path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "img");

            // Only images served by this host are downloaded.
            private const string AllowedHost = "www.xxx.com";

            // Matches an img tag and captures the (optionally quoted) src value as "imgUrl".
            private static readonly Regex ImgTagRegex = new Regex(
                @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
                RegexOptions.IgnoreCase);

            // Captures the text between the title tags as "title".
            private static readonly Regex TitleRegex = new Regex(
                @"<title[^>]*>(?<title>.+?)</title>",
                RegexOptions.IgnoreCase | RegexOptions.Singleline);

            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Click handler of the fetch button: validates the URL typed by the user and
            /// starts the scrape on a background thread so the UI thread is not blocked.
            /// </summary>
            private void btnshuaqu_Click(object sender, EventArgs e)
            {
                string url = txturl.Text.Trim();
                if (string.IsNullOrEmpty(url))
                {
                    MessageBox.Show("请输入URl");
                    return;
                }

                txtimg.AppendText("开始抓取中:" + Environment.NewLine);
                Thread th = new Thread(() => ShuaQu(url)) { IsBackground = true };
                th.Start();
            }

            /// <summary>
            /// Downloads the page at <paramref name="url"/>, shows its title, and downloads
            /// every image hosted on <see cref="AllowedHost"/>. Runs on a background thread;
            /// all control access is marshalled to the UI thread through Invoke.
            /// </summary>
            /// <param name="url">Address of the page to scrape.</param>
            private void ShuaQu(string url)
            {
                // Start every run from an empty download folder.
                if (System.IO.Directory.Exists(Path))
                {
                    System.IO.Directory.Delete(Path, true);
                }
                System.IO.Directory.CreateDirectory(Path);

                string result = WebHttp.HttpGet(url, null, 3) ?? string.Empty;
                string[] imageUrls = GetHtmlImageUrlList(result);
                AppendLog("已经获取到数据!" + imageUrls.Length);

                // Show the page title without the surrounding tags and without double quotes.
                Match titleMatch = TitleRegex.Match(result);
                string title = titleMatch.Success
                    ? titleMatch.Groups["title"].Value.Trim().Replace("\"", string.Empty)
                    : string.Empty;
                txttitle.Invoke(new Action(() =>
                {
                    txttitle.Text = title;
                }));

                List<Thread> downloads = new List<Thread>();
                foreach (string s in imageUrls)
                {
                    // Relative or malformed src values cannot be downloaded as-is; skip them
                    // instead of letting the Uri constructor throw on this worker thread.
                    Uri u;
                    if (!Uri.TryCreate(s, UriKind.Absolute, out u))
                    {
                        continue;
                    }
                    if (!string.Equals(u.Host, AllowedHost, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // Copy to a local so each thread captures its own URL even on compilers
                    // where the foreach variable is shared between iterations.
                    string imageUrl = s;
                    Thread downimg = new Thread(() => DownloadAndReport(imageUrl)) { IsBackground = true };
                    downimg.Start();
                    downloads.Add(downimg);
                    AppendLog(imageUrl);
                }

                // Wait for the downloads so the completion message is true when it is shown.
                foreach (Thread download in downloads)
                {
                    download.Join();
                }
                AppendLog("全部抓取完成!");
            }

            /// <summary>
            /// Downloads one image into the download folder. The file name is the last path
            /// segment of the URL with any query string removed.
            /// </summary>
            /// <param name="imgpath">Absolute URL of the image, query string allowed.</param>
            public void Get_img(string imgpath)
            {
                string[] file = imgpath.Split('?');
                string name = System.IO.Path.GetFileName(file[0]);
                using (WebClient mywebclient = new WebClient())
                {
                    mywebclient.DownloadFile(imgpath, System.IO.Path.Combine(Path, name));
                }
            }

            /// <summary>
            /// Thread entry point for a single download: a failed request is reported in the
            /// log box instead of terminating the process with an unhandled exception.
            /// </summary>
            private void DownloadAndReport(string imageUrl)
            {
                try
                {
                    Get_img(imageUrl);
                }
                catch (WebException ex)
                {
                    AppendLog("下载失败: " + imageUrl + " (" + ex.Message + ")");
                }
            }

            /// <summary>
            /// Appends one line to the log text box from any thread.
            /// </summary>
            private void AppendLog(string message)
            {
                txtimg.Invoke(new Action(() =>
                {
                    txtimg.AppendText(message + Environment.NewLine);
                }));
            }

            /// <summary>
            /// Gets the URLs of all images referenced by img tags in the HTML.
            /// </summary>
            /// <param name="sHtmlText">HTML source.</param>
            /// <returns>
            /// The src values in document order, HTML-decoded so that "&amp;amp;" in a query
            /// string becomes "&amp;"; an empty array when the input is null or empty.
            /// </returns>
            private string[] GetHtmlImageUrlList(string sHtmlText)
            {
                if (string.IsNullOrEmpty(sHtmlText))
                {
                    return new string[0];
                }

                return ImgTagRegex.Matches(sHtmlText)
                    .Cast<Match>()
                    .Select(match => WebUtility.HtmlDecode(match.Groups["imgUrl"].Value))
                    .ToArray();
            }
        }
    }
    #region 下载图片到Image
    /// <summary>
    /// Downloads the image at <paramref name="url"/> and decodes it into an
    /// <see cref="Image"/>.
    /// </summary>
    /// <param name="url">Address of the image to download.</param>
    /// <returns>The decoded image; the caller is responsible for disposing it.</returns>
    public static Image UrlToImage(string url) {
        byte[] bytes;
        using (WebClient mywebclient = new WebClient()) {
            bytes = mywebclient.DownloadData(url);
        }

        // Image.FromStream requires the stream to stay open for the lifetime of the Image,
        // so the MemoryStream is intentionally not wrapped in a using block. It only holds
        // the managed byte array and is reclaimed together with the image.
        MemoryStream ms = new MemoryStream(bytes);
        return Image.FromStream(ms);
    }
    #endregion
  • 相关阅读:
    20200807日报
    20200806日报
    《大道至简》读书感悟
    20200805日报
    20200804日报
    20200803日报
    20200802日报
    vue中mounted内如何调完异步方法再渲染
    小程序画布识别iPhone11
    np.meshgrid() 生成网格坐标函数
  • 原文地址:https://www.cnblogs.com/testsec/p/6095851.html
Copyright © 2011-2022 走看看