zoukankan      html  css  js  c++  java
  • 网络爬虫(抓取)正则表达式 (多线程协作)

    1.多线程调用界面后台代码
    
    
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using HraWeb.Common;
    using WebApp.Common;
    using Contract.Domain;
    using System.Collections;
    using System.IO;
    using System.Net;
    using System.Runtime.Serialization.Json;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Threading;
    using Common;
    using Elmah;
    using Framework;
    using ThreadTemplate;
    using System.Diagnostics;
    
    namespace HraWeb
    {
    
        /// <summary>
        /// Code-behind for the company management page. Clicking Button1 scrapes rating data for
        /// every CcrCompanyFundamental that has no CcrCreditScoreInfo row yet, using a bounded
        /// number of concurrently running <see cref="clsSubThread"/> workers, and hands each
        /// worker's result list to Dao.SaveOrUpdateAll.
        /// </summary>
        public partial class CcrCompanyManage : JQEntityManage<Contract.Domain.CcrCompanyFundamental>
        {
            /// <summary>Number of companies handed to a single worker.</summary>
            private const int PageSize = 1;

            /// <summary>Maximum number of workers allowed to run at the same time.</summary>
            private const int MaxRunningWorkers = 100;

            /// <summary>Pause between two polls of the running workers, in milliseconds.</summary>
            private const int PollIntervalMilliseconds = 50;

            private Spring.Caching.ICache cache;

            /// <summary>
            /// Cache instance, resolved on first access through ctx.GetObject("AspNetCache").
            /// </summary>
            private Spring.Caching.ICache SpringCache
            {
                get
                {
                    if (cache == null)
                    {
                        cache = (Spring.Caching.ICache)ctx.GetObject("AspNetCache");
                    }
                    return cache;
                }
                set { cache = value; }
            }

            /// <summary>
            /// Kept only so that existing references to this public field keep compiling.
            /// The scrape no longer writes to it: a static dictionary was shared by every request
            /// and never cleared, so each additional click appended the same companies again.
            /// </summary>
            public static Dictionary<int, IList<CcrCompanyFundamental>> dic = new Dictionary<int, IList<CcrCompanyFundamental>>();

            /// <summary>
            /// Splits <paramref name="items"/> into consecutive pages of at most
            /// <paramref name="pageSize"/> elements, keyed by 1-based page index.
            /// </summary>
            /// <param name="items">Items to distribute; an empty list yields an empty dictionary.</param>
            /// <param name="pageSize">Maximum number of items per page; must be positive.</param>
            /// <returns>Dictionary mapping page index (starting at 1) to the items of that page.</returns>
            private static Dictionary<int, IList<CcrCompanyFundamental>> BuildPages(IList<CcrCompanyFundamental> items, int pageSize)
            {
                var pages = new Dictionary<int, IList<CcrCompanyFundamental>>();
                for (int i = 0; i < items.Count; i++)
                {
                    int pageIndex = i / pageSize + 1;
                    IList<CcrCompanyFundamental> page;
                    if (!pages.TryGetValue(pageIndex, out page))
                    {
                        page = new List<CcrCompanyFundamental>();
                        pages.Add(pageIndex, page);
                    }
                    page.Add(items[i]);
                }
                return pages;
            }

            /// <summary>
            /// Scrapes rating information for all companies without a score record and starts a
            /// save thread for each finished worker.
            /// </summary>
            /// <returns>
            /// An empty <see cref="ArrayList"/>; nothing is ever added to it. The return value is
            /// kept for signature compatibility.
            /// </returns>
            private ArrayList GetRatingInfo()
            {
                ArrayList ratingClassPaimingStates = new ArrayList();

                // Companies that already have a score record are skipped.
                IList<CcrCreditScoreInfo> scored = Holworth.Utility.HraUtility.ListToT<CcrCreditScoreInfo>(Dao.FindList(new QueryInfo("CcrCreditScoreInfo")));
                var scoredIds = new HashSet<int?>(scored.Select(x => x.FundamentalId));

                IList<CcrCompanyFundamental> allCompanies = Holworth.Utility.HraUtility.ListToT<CcrCompanyFundamental>(Dao.FindList(new QueryInfo("CcrCompanyFundamental")));
                IList<CcrCompanyFundamental> pending = allCompanies.Where(f => !scoredIds.Contains(int.Parse(f.Id))).ToList();

                // One page per worker; the pages live in a local so repeated or concurrent
                // requests cannot see each other's data.
                Dictionary<int, IList<CcrCompanyFundamental>> pages = BuildPages(pending, PageSize);

                // Workers currently running, and workers still waiting for a free slot (FIFO).
                var running = new List<clsSubThread>();
                var waiting = new Queue<clsSubThread>();

                // Start the first MaxRunningWorkers workers, queue the rest.
                for (int pageIndex = 1; pageIndex <= pages.Count; pageIndex++)
                {
                    clsSubThread worker = new clsSubThread(pageIndex, pages[pageIndex]);
                    if (running.Count < MaxRunningWorkers)
                    {
                        running.Add(worker);
                        worker.Start();
                    }
                    else
                    {
                        waiting.Enqueue(worker);
                    }
                }

                // Every finished worker is immediately replaced by a waiting one (if any), so an
                // empty running list implies an empty waiting queue and therefore completion.
                while (running.Count > 0)
                {
                    // Iterate backwards so that removing the current entry and appending a
                    // replacement does not disturb the indices still to be visited.
                    for (int i = running.Count - 1; i >= 0; i--)
                    {
                        clsSubThread finished = running[i];
                        if (!finished.IsStopped)
                        {
                            continue;
                        }

                        running.RemoveAt(i);

                        // Persist the worker's results on a separate thread.
                        // NOTE(review): the save threads are not awaited and all share Dao;
                        // confirm that Dao may be used concurrently and outlives the request.
                        Thread saveThread = new Thread(new ParameterizedThreadStart(SaveOrUpdate));
                        saveThread.Start(finished.ReturnList);
                        finished.Dispose();

                        if (waiting.Count > 0)
                        {
                            clsSubThread next = waiting.Dequeue();
                            next.Start();
                            running.Add(next);
                        }
                    }

                    if (running.Count > 0)
                    {
                        // Yield between polls instead of spinning at full CPU.
                        Thread.Sleep(PollIntervalMilliseconds);
                    }
                }

                return ratingClassPaimingStates;
            }

            /// <summary>
            /// Completion callback for a worker: saves the list it produced.
            /// </summary>
            /// <param name="o">The worker's result list; must be castable to <see cref="IList"/>.</param>
            private void SaveOrUpdate(object o)
            {
                IList list = (IList)o;
                Dao.SaveOrUpdateAll(list);
            }

            /// <summary>Forwards page load handling to the base class.</summary>
            protected override void Page_Load(object sender, EventArgs e)
            {
                base.Page_Load(sender, e);
            }

            /// <summary>
            /// Click handler: scrapes the rating pages and stores the results via the workers.
            /// </summary>
            protected void Button1_Click(object sender, EventArgs e)
            {
                GetRatingInfo();
            }
        }
    }
    
    2.封装的线程类
    
    
    using System.Collections.Generic;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using Contract.Domain;
    using System.Web;
    namespace ThreadTemplate
    {
        using System;
        using System.Threading;
        using System.IO;
        /// <summary>
        /// Worker that, on a dedicated thread, downloads the credit search page for each company
        /// in its list and extracts rating class, rating sequence and rating state from the HTML
        /// with regular expressions. Results are exposed through <see cref="ReturnList"/> once
        /// <see cref="IsStopped"/> is true again.
        /// </summary>
        public class clsSubThread : IDisposable
        {
            /// <summary>Search endpoint; the URL-escaped company name is appended as keyword.</summary>
            private const string SearchUrl = "http://bgcheck.cn/MemberCenter/FirmCredit/Search.html?Keywords=";

            // Each pattern matches the text of the first <a>/<span> element that follows its label.
            // [\s\S] is used so the gap between label and tag may span line breaks.
            private static readonly Regex RatingClassRegex = new Regex(@"(?<=信用等级:[\s\S]*?<a[^>]*>)[\s\S]*?(?=</a>)");
            private static readonly Regex RatingSequenceRegex = new Regex(@"(?<=信用排名:[\s\S]*?<span[^>]*>)[\s\S]*?(?=</span>)");
            private static readonly Regex RatingStateRegex = new Regex(@"(?<=信用状况:[\s\S]*?<span[^>]*>)[\s\S]*?(?=</span>)");

            private Thread thdSubThread = null;
            private Mutex mUnique = new Mutex();

            // The status flags are written by the worker thread and read by the owning thread,
            // hence volatile.
            private volatile bool blnIsStopped;
            private volatile bool blnSuspended;
            private volatile bool blnStarted;
            private IList<CcrCompanyFundamental> clist;
            private int threadId;

            /// <summary>Identifier assigned by the creator of this worker.</summary>
            public int ThreadId { get { return threadId; } set { threadId = value; } }
            private IList<CcrCreditScoreInfo> paiming;

            /// <summary>True before the worker is started and again after it has finished or was stopped.</summary>
            public bool IsStopped
            {
                get { return blnIsStopped; }
            }

            /// <summary>True between a call to <see cref="Suspend"/> and the matching <see cref="Resume"/>.</summary>
            public bool IsSuspended
            {
                get { return blnSuspended; }
            }

            /// <summary>Score records collected so far by the worker thread.</summary>
            public IList<CcrCreditScoreInfo> ReturnList
            {
                get { return paiming; }
            }

            /// <summary>Replaces the list of companies to process.</summary>
            public IList<CcrCompanyFundamental> CList
            {
                set { clist = value; }
            }

            /// <summary>
            /// Creates a worker that is not yet running.
            /// </summary>
            /// <param name="key">Identifier stored in <see cref="ThreadId"/>.</param>
            /// <param name="pclist">Companies this worker will look up.</param>
            public clsSubThread(int key, IList<CcrCompanyFundamental> pclist)
            {
                threadId = key;
                paiming = new List<CcrCreditScoreInfo>();
                blnIsStopped = true;
                blnSuspended = false;
                blnStarted = false;
                clist = pclist;
            }

            /// <summary>
            /// Starts the worker thread; does nothing if the worker is already started.
            /// </summary>
            public void Start()
            {
                if (!blnStarted)
                {
                    thdSubThread = new Thread(new ThreadStart(SubThread));
                    blnIsStopped = false;
                    blnStarted = true;
                    thdSubThread.Start();
                }
            }

            /// <summary>
            /// Thread entry point: fetches the search page for every company and stores one
            /// CcrCreditScoreInfo per company in <see cref="ReturnList"/>. Fields for which no
            /// match is found are set to the empty string.
            /// </summary>
            private void SubThread()
            {
                paiming = new List<CcrCreditScoreInfo>();

                // Snapshot the list so a concurrent CList assignment cannot change it mid-run.
                IList<CcrCompanyFundamental> companies = clist ?? new List<CcrCompanyFundamental>();
                try
                {
                    using (WebClient wc = new WebClient())
                    {
                        for (int i = 0; i < companies.Count; i++)
                        {
                            // Stop() was requested from another thread: finish early.
                            if (blnIsStopped)
                            {
                                break;
                            }

                            CcrCompanyFundamental company = companies[i];
                            CcrCreditScoreInfo c = new CcrCreditScoreInfo();
                            c.FundamentalId = int.Parse(company.Id);
                            c.CompanyName = company.CompanyName;

                            // Search for this company (the keyword used to be hard-coded to one
                            // fixed company name); escape it because it is placed in a query string.
                            string keyword = Uri.EscapeDataString(company.CompanyName ?? string.Empty);
                            using (Stream stream = wc.OpenRead(SearchUrl + keyword))
                            {
                                using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
                                {
                                    string content = sr.ReadToEnd();

                                    // Match.Value is the empty string when nothing matched.
                                    c.RatingClass = RatingClassRegex.Match(content).Value;
                                    c.RatingSequence = RatingSequenceRegex.Match(content).Value;
                                    c.RatingState = RatingStateRegex.Match(content).Value;
                                    paiming.Add(c);
                                }
                            }
                        }
                    }
                }
                finally
                {
                    // Mark the worker as stopped even when a request or parse fails, so that a
                    // caller polling IsStopped is not left waiting forever.
                    this.Stop();
                }
            }

            /// <summary>
            /// Marks the worker as suspended and acquires the internal mutex on the calling thread.
            /// </summary>
            /// <remarks>
            /// NOTE(review): the worker thread never waits on this mutex, so this does not pause
            /// the download loop — confirm whether real suspension is required.
            /// </remarks>
            public void Suspend()
            {
                if (blnStarted && !blnSuspended)
                {
                    blnSuspended = true;
                    mUnique.WaitOne();
                }
            }

            /// <summary>
            /// Clears the suspended mark and releases the internal mutex. Must be called on the
            /// same thread that called <see cref="Suspend"/>, because a mutex can only be released
            /// by its owner.
            /// </summary>
            public void Resume()
            {
                if (blnStarted && blnSuspended)
                {
                    blnSuspended = false;
                    mUnique.ReleaseMutex();
                }
            }

            /// <summary>
            /// Marks the worker as stopped. When called from another thread it also waits for the
            /// worker thread to exit; the worker checks the stopped flag between companies.
            /// </summary>
            public void Stop()
            {
                if (blnStarted)
                {
                    if (blnSuspended)
                    {
                        Resume();
                    }

                    blnStarted = false;
                    blnIsStopped = true;

                    // The worker calls Stop() itself when it finishes; a thread joining itself
                    // would block forever, so only join from a different thread.
                    if (thdSubThread != null && Thread.CurrentThread != thdSubThread)
                    {
                        thdSubThread.Join();
                    }
                }
            }

            #region IDisposable Members
            /// <summary>
            /// Stops the worker and releases the internal mutex handle.
            /// </summary>
            public void Dispose()
            {
                Stop();//Stop thread first
                mUnique.Close();
                GC.SuppressFinalize(this);
            }

            #endregion
        }
    }

     

  • 相关阅读:
    医学影像分割之HIP
    c++画分形之Julia集与Mandelbrot集
    趣题一道
    华山论剑常用角点检测与角点匹配方法比较
    改变鼠标样式
    Unity3D Pro 利用摄像头产生俯视地图效果
    unity3D小地图教程
    WebBrowser网址中特殊字符的问题
    打开多个unity3D项目 (项目多开)
    u3d按住鼠标右键才转动摄像机的方法
  • 原文地址:https://www.cnblogs.com/kexb/p/5115233.html
Copyright © 2011-2022 走看看