zoukankan      html  css  js  c++  java
  • 多线程带智能采集策略的采集系统

        去年年底的时候曾经发过一个数据采集器《网页数据采集器》,那是专门针对某一个网站来进行采集的,如果需要采集新的网站内容,就需要修改代码并重新编译。

        昨晚完成了一个带智能策略的采集系统。其实,这个策略的方案三年前就想好了,那时候打算用VB做,做了一半就搁置了。现在用C#才终于把这个方案实现了。

        整个方案大概是这样的:

        需要建立一个AC数据库,MSSQL也行,有四个表:PageType用于记录页面的种类,比如列表页和详细页两类;Url表用于记录要采集的网址,另外还有一个字段TypeID标明该网址属于哪一种页面类型,比如是列表页还是详细页;Rule表记录着各种规则,主要有三个字段,FromTypeID源页类型,ToTypeID目的页类型,Pattern规则;CjPage用于存储采集到的网页内容,还包含网址和页面种类。

        采集策略的核心就在于规则库Rule。

        工作过程大概这样:
    1,采集线程从Url表抽取一个网址,并马上在表中将其删除,为了防止冲突,这个过程需要用多线程同步解决;
    2,用WebClient请求该网址的页面内容;
    3,取得内容后,给线程池的线程来分析处理,本线程回到1,继续去Url表取下一个网址;
    4,线程池在有空闲线程时,会调用分析函数ParsePage去处理上次获得的页面内容;
    5,先到Rule中取所有FromTypeID为当前网址TypeID的规则;
    6,如果没有取到任何规则Rule,则将本页内容写入到CjPage中;
    7,如果取到规则,那么遍历规则,为每条规则执行ParseUrl方法;
    8,ParseUrl根据规则的Pattern匹配到页面内容中的所有网址,并记录到Url中,规则的ToTypeID就是Url的TypeID。

        至此,整个流程就完成了。下面举一个实际例子来说明一下:
        我要截取动网开发者网络的所有ASP文章http://www.cndw.com/tech/asp/
        首先,在页面类型库中加入列表页和详细页两行,再把http://www.cndw.com/tech/asp/写入到Url中,页面类型是列表页;
        其次,在Rule中加入两条规则:
            一,从列表页取得详细页的网址FromTypeID=1  ToTypeID=2,Pattern是· <a href="([^>]*)" target=_blank>,这条规则将会识别列表页上的所有详细页的链接,并记入到Url中,TypeID是详细页;
            二,从列表页取得列表页的网址FromTypeID=1  ToTypeID=1,Pattern是<a href='([^>]*)'>下一页<\/a>,这条规则将会取得当前列表页上的下一页的链接,并记入到Url中,TypeID还是列表页。
        采集器工作时,如果采集的是详细页的内容,将会直接写入到CjPage中,因为没有FromTypeID=2的规则;而采集的是列表页的内容时,就要做两件事了,因为有两条FromTypeID=1的规则,一件事是识别当前列表页中所有文章的链接并存入Url,另一件事是识别下一列表页链接并存入Url。
        由于规则具有递归性,使得采集器能递归采集到所有的文章。

        下面是一些核心源码(没有公开的都是一些数据层的添删改查的代码):

    以下是代码片段:

    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Net;
    using System.Threading;
    using CJData;
    using System.Text.RegularExpressions;
    using NLog;

    namespace CJ
    {
        /// <summary>
        /// Callback used to write a log message.
        /// </summary>
        /// <param name="log">The log text to write.</param>
        public delegate void WriteLogCallBack(String log);
        /// <summary>
        /// 采集
        /// </summary>
        public class CaiJi
        {
            private WebClient _wc;

            public WebClient Wc
            {
                get
                {
                    if (_wc == null) _wc = new WebClient();
                    return _wc;
                }
            }
            private Thread thread;

            public String Name = "";
            public event WriteLogCallBack OnWriteLog;

            /// <summary>
            /// 开始工作
            /// </summary>
            public void Start()
            {
                if (thread != null) return;
                thread = new Thread(new ThreadStart(Work));
                thread.Start();
            }
            /// <summary>
            /// 停止工作
            /// </summary>
            public void Stop()
            {
                if (thread != null) thread.Abort();
                thread = null;
            }

            private void Work()
            {
                int times = 0;
                while (times < 100)
                {
                    Url url = Url.SelectOne();
                    try
                    {
                        if (url != null)
                        {
                            String page = Wc.DownloadString(url.UrlAddress);
                            if (!String.IsNullOrEmpty(page))
                            {
                                OnWriteLog(Name + " 成功抓取:" + url.UrlAddress);
                                times = 0;
                                ThreadPool.QueueUserWorkItem(new WaitCallback(ParsePage), new Object[] { url, page });
                            }
                        }
                        else
                        {
                            //OnWriteLog(Name + " 没有工作,休息半秒");
                            times++;
                            //没有工作,休息半秒
                            Thread.Sleep(500);
                        }
                    }
                    catch (ThreadAbortException e)
                    {
                        OnWriteLog(Name + " 外部终止");
                        break;
                    }
                    catch (Exception e)
                    {
                        times++;
                        OnWriteLog(Name + " 赚取" + url.UrlAddress + "出错,休息半秒。" + e.Message);
                        Trace.WriteLine(url.UrlAddress);
                        //出错,休息半秒
                        Thread.Sleep(500);
                    }
                }
                OnWriteLog(Name + " 完成!");
            }

            private void ParsePage(Object state)
            {
                Object[] objs = (Object[])state;
                Url url = objs[0] as Url;
                String page = (String)objs[1];
                IList<Rule> rs = Rule.SelectAll(Rule._.FromTypeID, url.TypeID);
                //if (url.PageType.TypeName == "详细页")
                if (rs == null || rs.Count < 1)
                {
                    CjPage cp = new CjPage();
                    cp.CjTime = DateTime.Now;
                    cp.Content = page;
                    cp.Url = url.UrlAddress;
                    cp.TypeID = url.TypeID;
                    cp.Insert();
                }
                else
                {
                    foreach (Rule r in rs)
                    {
                        ParseUrl(url, r, page);
                    }
                }
            }
            private void ParseUrl(Url u, Rule r, String page)
            {
                Regex reg = new Regex(r.Pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                MatchCollection ms = reg.Matches(page);
                foreach (Match m in ms)
                {
                    Url url = new Url();
                    url.TypeID = r.ToTypeID;
                    url.UrlAddress = m.Groups[1].Value;
                    if (!url.UrlAddress.StartsWith("http://"))
                    {
                        if (url.UrlAddress.Substring(0, 1) == "/")
                        {
                            url.UrlAddress = u.UrlAddress.Substring(0, u.UrlAddress.IndexOf("/", 8)) + url.UrlAddress;
                        }
                        else
                        {
                            if (u.UrlAddress.Substring(u.UrlAddress.Length - 1) == "/")
                                url.UrlAddress = u.UrlAddress + url.UrlAddress;
                            else
                                if (u.UrlAddress.LastIndexOf("/") < u.UrlAddress.LastIndexOf("."))
                                    url.UrlAddress = u.UrlAddress.Substring(0, u.UrlAddress.LastIndexOf("/") + 1) + url.UrlAddress;
                                else
                                    url.UrlAddress = u.UrlAddress + "/" + url.UrlAddress;
                        }
                    }
                    url.Insert();
                }
            }
        }
    }

    以下是代码片段:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;

    namespace CJ
    {
        public partial class Form1 : Form
        {
            public Form1()
            {
                InitializeComponent();
            }

            CaiJi[] cjs;
            private void button1_Click(object sender, EventArgs e)
            {
                Button btn = sender as Button;
                if (btn.Text == "停止")
                {
                    foreach (CaiJi cj in cjs)
                    {
                        if (cj != null) cj.Stop();
                    }
                    cjs = null;
                    btn.Text = "开始";
                    return;
                }

                richTextBox1.Text = "";
                btn.Text = "停止";

                int k = 100;
                if (!int.TryParse(textBox1.Text, out k)) k = 100;
                cjs = new CaiJi[k];
                for (int i = 0; i < cjs.Length; i++)
                {
                    cjs[i] = new CaiJi();
                    cjs[i].Name = "线程" + i.ToString("00");
                    cjs[i].OnWriteLog += new WriteLogCallBack(cj_OnWriteLog);
                }
                foreach (CaiJi cj in cjs)
                {
                    cj.Start();
                }
            }

            void cj_OnWriteLog(string log)
            {
                if (richTextBox1.InvokeRequired)
                {
                    richTextBox1.Invoke(new WriteLogCallBack(cj_OnWriteLog), new object[] { log });
                }
                else
                {
                    if (richTextBox1.Lines.Length > 3000) richTextBox1.Text = "";
                    richTextBox1.Text = log + Environment.NewLine + richTextBox1.Text;
                }
            }
        }
    }


  • 相关阅读:
    python目录
    面向对象
    模块(二)
    python函数(四)
    助教工作总结
    第五次个人作业:个人总结
    Typroa编写的图片上传博客园
    msfconsole利用ms17-010和ms12-020攻击
    第四次个人作业——案例分析
    助教周报(第二轮)
  • 原文地址:https://www.cnblogs.com/nnhy/p/860656.html
Copyright © 2011-2022 走看看