zoukankan      html  css  js  c++  java
  • C#写爬虫,版本V2.1

      这次是对2.0的小修补:2.0几乎没有交互,这次添加了进度条和文本框。同时,下载所取得的链接时最常出现的错误是webResponse错误(GetResponse抛出WebException)。

    针对这种情况,设置了

     // Turn a failed request into its HTTP response instead of letting the
     // exception propagate: when GetResponse throws a WebException, fall back to
     // the response attached to the exception (which may be null), so the code
     // that follows can decide what to do by inspecting StatusCode.
     try
                    {
                        webResponse = (HttpWebResponse)webRequest.GetResponse();
                    }
                    catch(WebException ex)
                    {
                        webResponse = (HttpWebResponse)ex.Response;
                    }

    捕获异常后这里不做额外处理,后续直接判断StatusCode属性来决定是否还要执行下面的程序。

    另外一点变化就是以前是通过将所获取的网页存到文本中去,这次

    WebRequest myRequest = WebRequest.Create("http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=" + Uri.EscapeDataString(keyWord));
                HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();
                if (myResponse.StatusCode == HttpStatusCode.OK)
                {
                    Stream strm = myResponse.GetResponseStream();
                    StreamReader sr = new StreamReader(strm);
                    string line = sr.ReadToEnd();

    将它全放入了string中。

    最后一点是去掉了DownloadPage这个方法,如上,它的功能可以放入按钮的单击事件中实现,没有必要把一件事做两遍。

    下面是前台页面:

    后台代码:

    using Newtonsoft.Json;
    using Newtonsoft.Json.Linq;
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.IO;
    using System.Linq;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading.Tasks;
    using System.Windows.Forms;
    
    namespace 百度图片爬虫V2._1
    {
        /// <summary>
        /// Main window of the Baidu image crawler. Searches Baidu Images for the
        /// keyword typed into <c>textBox1</c>, extracts image links from the result
        /// page, downloads every image on a background thread, stores it in the
        /// database as JPEG bytes and reports progress through <c>progressBar1</c>
        /// and <c>textBox2</c>.
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>
            /// Signature of the background download worker (see <see cref="savePicture"/>).
            /// </summary>
            /// <param name="s">URL of the picture to download.</param>
            /// <param name="i">Zero-based index of the picture within the current batch.</param>
            public delegate void AsynFunction(string s,int i);

            // Matches absolute http:// URLs. The character classes need the escaped
            // forms \w and \. ; without the backslashes the pattern only accepts hosts
            // made of the letter 'w' and '-', so practically no real link is found.
            private static readonly Regex LinkRegex = new Regex(
                @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
                RegexOptions.IgnoreCase);

            // Substrings that mark a URL as pointing to an image resource.
            private static readonly string[] ImageExtensions = { ".jpg", ".gif", ".png" };

            // Number of downloads of the current batch that have finished (successfully
            // or not). Only read and written on the UI thread, so no locking is needed.
            private int completedDownloads;

            /// <summary>Creates the form and its designer-generated controls.</summary>
            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Extracts every image link (see <see cref="isValiable"/>) from a page.
            /// </summary>
            /// <param name="html">Page source to scan; null or empty yields no links.</param>
            /// <param name="counts">Receives the number of links returned.</param>
            /// <returns>
            /// The image links in document order. The array holds exactly
            /// <paramref name="counts"/> entries and never contains null.
            /// </returns>
            private static string[] getLinks(string html, out int counts)
            {
                List<string> links = new List<string>();
                if (!string.IsNullOrEmpty(html))
                {
                    foreach (Match match in LinkRegex.Matches(html))
                    {
                        if (isValiable(match.Value))
                        {
                            links.Add(match.Value);
                        }
                    }
                }

                counts = links.Count;
                return links.ToArray();
            }

            /// <summary>
            /// Runs a search for the keyword in <c>textBox1</c> and starts one background
            /// download per image link found on the result page.
            /// </summary>
            private void button1_Click(object sender, EventArgs e)
            {
                string keyWord = this.textBox1.Text;
                string html;
                try
                {
                    WebRequest myRequest = WebRequest.Create("http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=" + Uri.EscapeDataString(keyWord));
                    using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
                    {
                        if (myResponse.StatusCode != HttpStatusCode.OK)
                        {
                            return;
                        }

                        using (Stream strm = myResponse.GetResponseStream())
                        using (StreamReader sr = new StreamReader(strm))
                        {
                            html = sr.ReadToEnd();
                        }
                    }
                }
                catch (WebException ex)
                {
                    // The search request itself failed (network error, HTTP error status):
                    // report it in the log box instead of letting it escape the handler.
                    this.textBox2.Text += Environment.NewLine + ex.Message;
                    return;
                }

                int counts;
                string[] str = getLinks(html, out counts);

                // Start a fresh batch: the bar advances by one per finished download.
                this.completedDownloads = 0;
                this.progressBar1.Value = this.progressBar1.Minimum;
                this.progressBar1.Maximum = counts;

                for (int i = 0; i < counts; i++)
                {
                    // Per-iteration copies so the callback never observes the shared loop variable.
                    string link = str[i];
                    AsynFunction fun = new AsynFunction(savePicture);
                    fun.BeginInvoke(link, i, ar =>
                    {
                        string status = "下载结束";
                        try
                        {
                            // EndInvoke rethrows whatever savePicture threw. This callback runs
                            // on a thread-pool thread, where an unhandled exception would
                            // terminate the process, so report the failure instead.
                            fun.EndInvoke(ar);
                        }
                        catch (Exception ex)
                        {
                            status = link + "下载失败: " + ex.Message;
                        }

                        runOnUiThread(() =>
                        {
                            this.completedDownloads++;
                            // Clamp: callbacks of an earlier batch may still arrive after a new search.
                            this.progressBar1.Value = Math.Min(this.completedDownloads, this.progressBar1.Maximum);
                            this.textBox2.Text += Environment.NewLine + status;
                        });
                    }, fun);
                }
            }

            /// <summary>
            /// Tells whether a URL looks like an image resource, i.e. contains
            /// ".jpg", ".gif" or ".png" (compared case-insensitively, like the link regex).
            /// </summary>
            /// <param name="url">URL to test; null or empty is never an image.</param>
            /// <returns>true when the URL contains one of the image extensions.</returns>
            private static bool isValiable(string url)
            {
                if (string.IsNullOrEmpty(url))
                {
                    return false;
                }

                foreach (string extension in ImageExtensions)
                {
                    if (url.IndexOf(extension, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        return true;
                    }
                }

                return false;
            }

            /// <summary>
            /// Downloads one picture, re-encodes it as JPEG and inserts the bytes into the
            /// <c>pictureUrl</c> table, then appends a line to the log box. Intended to run
            /// on a background thread; UI updates are marshalled to the UI thread.
            /// </summary>
            /// <param name="path">Absolute URL of the picture. Null, empty and non-image URLs are ignored.</param>
            /// <param name="i">
            /// Index of the picture within the batch. Kept for the <see cref="AsynFunction"/>
            /// signature; progress is counted by the completion callback, because downloads
            /// finish out of order and the index is not a valid progress value.
            /// </param>
            public void savePicture(string path,int i)
            {
                // Only image URLs are stored, so skip everything else before touching the network.
                if (string.IsNullOrEmpty(path) || !isValiable(path))
                {
                    return;
                }

                HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(new Uri(path));
                webRequest.Referer = "http://image.baidu.com";
                // Timeout for GetResponse, in milliseconds.
                webRequest.Timeout = 30000;
                webRequest.AllowAutoRedirect = true;
                webRequest.Headers.Set("Pragma", "no-cache");
                webRequest.UserAgent = "Mozilla-Firefox-Spider(Wenanry)";

                HttpWebResponse webResponse;
                try
                {
                    webResponse = (HttpWebResponse)webRequest.GetResponse();
                }
                catch (WebException ex)
                {
                    // An HTTP error status still carries a response; it is null when the
                    // server could not be reached at all. StatusCode decides below.
                    webResponse = ex.Response as HttpWebResponse;
                }

                if (webResponse == null)
                {
                    return;
                }

                using (webResponse)
                {
                    if (webResponse.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    byte[] jpegBytes;
                    // The source stream must stay open for as long as the Bitmap lives, so
                    // the bitmap is declared after (and therefore disposed before) the stream.
                    using (Stream responseStream = webResponse.GetResponseStream())
                    using (Bitmap myImage = new Bitmap(responseStream))
                    using (MemoryStream ms = new MemoryStream())
                    {
                        myImage.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                        jpegBytes = ms.ToArray();
                    }

                    using (DataClasses1DataContext db = new DataClasses1DataContext())
                    {
                        var p = new pictureUrl
                        {
                            pictureUrl1 = jpegBytes
                        };
                        db.pictureUrl.InsertOnSubmit(p);
                        db.SubmitChanges();
                    }
                }

                runOnUiThread(() =>
                {
                    this.textBox2.Text += path + "图片下载开始" + Environment.NewLine;
                });
            }

            /// <summary>Closes the window.</summary>
            private void button2_Click(object sender, EventArgs e)
            {
                this.Close();
            }

            /// <summary>
            /// Queues <paramref name="action"/> on the UI thread. Does nothing when the
            /// form has already been closed, so late background callbacks cannot crash
            /// the process.
            /// </summary>
            private void runOnUiThread(Action action)
            {
                if (this.IsDisposed || !this.IsHandleCreated)
                {
                    return;
                }

                try
                {
                    this.BeginInvoke(action);
                }
                catch (InvalidOperationException)
                {
                    // The form was closed between the check above and the call
                    // (ObjectDisposedException is an InvalidOperationException);
                    // there is no UI left to update.
                }
            }
        }
    }
  • 相关阅读:
    how to use http.Agent in node.js
    How Node.js Multiprocess Load Balancing Works
    Child Process
    What does cmd /C mean? [closed] 关于nodejs的子进程部分
    Net
    二进制与十六进制的关系
    POJ 1201 Intervals (差分约束系统)
    POJ 1201 Intervals (差分约束系统)
    差分约束系统
    差分约束系统
  • 原文地址:https://www.cnblogs.com/JsonZhangAA/p/5616654.html
Copyright © 2011-2022 走看看