zoukankan      html  css  js  c++  java
  • CCrawl

     1 #ifndef _Crawl_H_031104_
     2 #define _Crawl_H_031104_
     3 
     4 //#include <openssl/md5.h>
     5 #include <zlib.h>
     6 
     7 #include "Tse.h"
     8 #include "Http.h"
     9 #include "StrFun.h"
    10 #include "Url.h"
    11 #include "Page.h"
    12 #include "TianwangFile.h"
    13 #include "IsamFile.h"
    14 #include "Link4SEFile.h"
    15 
    16 using namespace std;
    17 
    18 class CCrawl
    19 {
    20 public:
    21     string m_sInputFileName;    //种子URL的文件名字: tse_seed.pku
    22     string m_sOutputFileName;    //保存我们已经访问过的URL的文件名字: visited.all
    23 
    24     CIsamFile m_isamFile;        // ISAM file handle
    25 
    26     ofstream m_ofsVisitedUrlFile;    //visited.all的文件句柄
    27     ofstream m_ofsLink4SEFile;    //link4SE.url的文件句柄
    28     ofstream m_ofsLink4HistoryFile;    //link4History.url的文件句柄
    29     ofstream m_ofsUnreachHostFile;    //tse_unreachHost.list的文件句柄
    30 
    31     ofstream m_ofsVisitedUrlMD5File;//tse_md5.visitedurl的文件句柄
    32     ofstream m_ofsVisitedPageMD5File;//tse_md5.visitedpage
    33 
    34     ofstream m_ofsUnreachUrlFile;    // unreach URL file handle
    35 
    36 
    37 public:
    38     CCrawl();//无参构造函数  "tse_seed.pku" "visited.all"
    39     CCrawl(string strInputFile, string strOutputFile);
    40     ~CCrawl();
    41 
    42     //CCrawl类中最重要的函数
    43     void DoCrawl();
    44 
    45     //根据URL以及套接字文件描述符抓取URL对应的网页
    46     void DownloadFile( CTianwangFile *pTianwangFile,CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock);
    47 
    48     //每个线程函数start()都调用这个函数
    49     void fetch(void *arg);
    50 
    51     //如果url满足条件加到mmapUrls[待访问的url]容器中
    52     void AddUrl(const char *url);
    53 
    54     void GetVisitedUrlMD5();//得到已经访问过的URL对应的MD5值,放入open list[setVisitedUrlMD5]中
    55     void GetVisitedPageMD5();//得到已经访问过的web网页体对应的MD5值,放入setVisitedPageMD5中
    56 
    57     void GetIpBlock();//得到阻塞的IP,放入mapIpBlock容器中
    58 
    59     void GetUnreachHostMD5();//得到不可到达的主机号,放入setUnreachHostMD5中
    60     void OpenFilesForOutput();//打开所有的输出流
    61 
    62     // save in the process
    63     void SaveTianwangRawData(CTianwangFile *pTianwangFile,CUrl *pUrl, CPage *pPage);//将抓取的网页以天网格式存储
    64     void SaveLink4SERawData(CLink4SEFile *pLink4SEFile,CUrl *pUrl, CPage *pPage);//将抓取的网页从中提取超链接信息建立网页结构库
    65 
    66     void SaveIsamRawData(CUrl *pUrl, CPage *Page);
    67     void SaveVisitedUrl(string url);//保存已经访问过的URL
    68     void SaveUnreachHost(string host);//保存不可到达的主机号
    69     void SaveLink4SE(CPage *Page);//保存为搜索引擎准备的超链接信息
    70     bool SaveLink4SE031121(void *arg);
    71     void SaveLink4History(CPage *Page)//保存为历史网页存档准备的超链接信息
    72 
    73     // save while the program running
    74     void SaveVisitedUrlMD5(string md5);//保存已经访问过的URL对应的MD5值
    75     void SaveVisitedPageMD5(string md5);//得到已经访问过的web网页体对应的MD5值
    76 
    77 };
    78 
    79 #endif /* _CRAWL_H_031104_ */
       1 #include "Crawl.h"
       2 #include "Url.h"
       3 #include "Md5.h"
       4 
       5 #include <list.h>
       6 #include <hlink.h>
       7 #include <uri.h>
       8 
       9 extern pthread_mutex_t mymutex;
      10 extern map<string, string> mapCacheHostLookup; //DNS缓存
      11 extern vector<string> vsUnreachHost;
      12 extern char **ParseRobot(char *data, char len);
      13 
      14 set<string> setVisitedUrlMD5; //open list[已经访问的URL对应的MD5值]
      15 set<string> setVisitedPageMD5; //已经访问过的web网页体对应的MD5值
      16 set<string> setUnvisitedUrlMD5; //close list[没有访问过的URL对应的MD5值]
      17 
      18 set<string> setUnreachHostMD5; //不可达到的主机号对应的MD5值的集合
      19 
      20 multimap<string, string, less<string> > replicas; //web网页体对应的MD5值<->web网页体对应的URL
      21 
      22 //定义线程的互斥变量并初始化
      23 pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER; //保护mmapUrls资源
      24 pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER; //保护setUnreachHostMD5&&m_ofsUnreachHostFile资源
      25 pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setUnvisitedUrlMD5资源
      26 pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setVisiteUrlMD5&&m_ofsVisiteUrlMD5File资源
      27 pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setVisitePageMD5&&m_ofsVisitePageMD5File资源
      28 
      29 pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
      30 pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER; //保护m_ofsLink4SEFile资源
      31 pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER;
      32 pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER;
      33 pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER;
      34 pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER;
      35 pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER;
      36 //pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER;
      37 
      38 map<unsigned long, unsigned long> mapIpBlock; //IP阻塞范围
      39 bool b_fOver; //线程运行控制参数
      40 //multimap<string,string, less<string> > mmapUrls;
      41 multimap<string, string> mmapUrls; //保存没有访问过的URL的主机号<->对应的URL
      42 
      43 typedef map<unsigned long, unsigned long>::value_type valTypeIpBlock;
      44 typedef map<string, string>::value_type mvalType;
      45 
      46 void SaveReplicas(const char* filename); //保存镜像网页对应的URL的一个值到指定的文件名中
      47 
      48 struct package {
      49     CCrawl *crawl;
      50     CPage *page;
      51 };
      52 
      53 vector<string> vsParsedLinks;
      54 
      55 int onfind(const char *elem, const char *attr, struct uri *uri, void *arg) {
      56     struct package *p = (struct package*) arg;
      57     char buff[URL_LEN + 1];
      58 
      59     if (uri_recombine(uri, buff, URL_LEN + 1,
      60             C_SCHEME | C_AUTHORITY | C_PATH | C_QUERY) >= 0)
      61 
      62     {
      63         vsParsedLinks.push_back(buff);
      64         if (!p->page->IsFilterLink(buff)) {
      65             // accept "a,link,frame,iframe,img,area"
      66 
      67             if (strcasecmp(elem, "img") == 0) {
      68                 pthread_mutex_lock(&mutexLink4HistoryFile);
      69                 if (p->crawl->m_ofsLink4HistoryFile) {
      70                     p->crawl->m_ofsLink4HistoryFile << buff << endl;
      71                 }
      72                 pthread_mutex_unlock(&mutexLink4HistoryFile);
      73 
      74             } else {
      75                 p->crawl->AddUrl(buff);
      76             }
      77             /*
      78              else if (strcasecmp(elem, "img") == 0)
      79              {
      80              pthread_mutex_lock(&mutexLink4HistoryFile);
      81              if( p->crawl->m_ofsLink4HistoryFile ){
      82              p->crawl->m_ofsLink4HistoryFile << p->page->m_sUrl << endl;;
      83              }
      84              pthread_mutex_unlock(&mutexLink4HistoryFile);
      85              }
      86              */
      87         }
      88     }
      89 
      90     uri_destroy(uri);
      91     free(uri);
      92     return 1;
      93 }
      94 
      95 /***********************************************************************
      96  * Function name: start
      97  * Input argv:
      98  *     -- arg: the CCrawl handle
      99  * Output argv:
     100  *     --
     101  * Return:
     102  ***********************************************************************/
     103 //线程函数-->每个线程函数调用fetch(void*arg)函数
     104 void* start(void *arg) {
     105     ((CCrawl*) arg)->fetch(arg);
     106 }
     107 
     108 /*
     109  这个函数设计的很巧妙,这里说的巧妙不是函数写有多hi:
     110  我们知道spider在最开始抓取网页的时候需要种子url,
     111  我们这个spider的种子url文件库是tse_seed.ur文件
     112  而这个函数正好在我们强制中断程序的时候,将mmapUrls
     113  中没有访问完的url放入tse_unvisited.url文件中,
     114  扩充了我们的种子URL库!*/
     115 void SaveUnvisitedUrl() {
     116     ofstream ofsUnvisitedUrl;
     117     ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(),
     118             ios::in | ios::out | ios::trunc | ios::binary); //以二进制可追加写方式打开文件
     119     if (!ofsUnvisitedUrl) //打开失败
     120     {
     121         cerr << "cannot open " << UNVISITED_FILE << "for output" << endl;
     122         exit(-1);
     123     }
     124 
     125     //将mmapUrls中没有访问完的url放入tse_unvisited.url文件中,扩充了我们的URL种子库!
     126     multimap<string, string>::iterator it = mmapUrls.begin();
     127     for (; it != mmapUrls.end(); it++) {
     128         ofsUnvisitedUrl << ((*it).second).c_str() << "\n";
     129     }
     130 
     131     ofsUnvisitedUrl << endl;
     132     ofsUnvisitedUrl.close();
     133 
     134 }
     135 
     136 /***********************************************************************
     137  * Function name: fetch
     138  * Input argv:
     139  *     -- arg: the CCrawl handle
     140  * Output argv:
     141  *     --
     142  * Return:
     143  ***********************************************************************/
     144 void CCrawl::fetch(void *arg) //每个线程都执行这个函数
     145         {
     146     string strUrl, host;
     147 
     148     int nGSock = -1; //之前的套接字文件描述符
     149     string strGHost = ""; //字前的主机号
     150 
     151     // create a Tianwang file for output the raw page data
     152     string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self()); //Tianwang.raw+"线程号"
     153     CTianwangFile tianwangFile(ofsName); //创建一个天网格式的文件,保存为原始网页库
     154 
     155     // create a Link4SE file for output the raw link data
     156     ofsName = DATA_LINK4SE_FILE + "." + CStrFun::itos(pthread_self()); //Link4SE.raw+"线程号"
     157     CLink4SEFile link4SEFile(ofsName); //创建一个网页结构库
     158 
     159     int iSleepCnt = 0; //线程运行控制参数
     160     for (;;) {
     161         pthread_mutex_lock(&mutexCollection); //互斥的锁定函数
     162         //if( !mmapUrls.empty() ){
     163         int cnt = mmapUrls.size();
     164         if (cnt > 0) {
     165             //已经收集的没有访问的url
     166             cout << "collection has: " << cnt << " unvisited urls" << endl;
     167             multimap<string, string>::iterator it = mmapUrls.begin();
     168             if (it != mmapUrls.end()) {
     169                 // get an URL
     170                 strUrl = (*it).second; //从待访问的URL队列中得到一个URL进行访问
     171 
     172                 // remove it from the collection
     173                 mmapUrls.erase(it); //删除迭代器所指的元素
     174 
     175                 pthread_mutex_unlock(&mutexCollection); //互斥的解锁函数
     176 
     177                 // parse URL
     178                 CUrl iUrl; //关键是看看strUrl是否有http://协议号,没有返回false
     179                 if (iUrl.ParseUrlEx(strUrl) == false) {
     180                     cout << "ParseUrlEx error in fetch(): " << strUrl << endl;
     181                     continue;
     182                 }
     183 
     184                 //表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同
     185                 //故,我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的
     186                 //套接字文件描述符进行通信,这是由于循环导致的
     187                 if (strGHost != iUrl.m_sHost) {
     188                     close(nGSock);
     189                     nGSock = -1;
     190                     strGHost = iUrl.m_sHost;
     191                 }
     192 
     193                 //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库
     194                 ((CCrawl*) arg)->DownloadFile(&tianwangFile, &link4SEFile, iUrl,
     195                         nGSock);
     196 
     197                 cnt = 0;
     198             } else {
     199                 pthread_mutex_unlock(&mutexCollection);
     200             }
     201         } else {
     202             //待访问的URL队列mmapUrls中没有URL了,这个时候我们必须挂起线程进行等待
     203             pthread_mutex_unlock(&mutexCollection);
     204             usleep(1000);
     205             iSleepCnt++;
     206         }
     207 
     208         if (b_fOver == true && iSleepCnt == 200) //当URL队列mmapUrls有200次都是空的时候就结束这个线程调用的fetch()函数
     209             break;
     210         /*
     211          if( b_fOver == true ){
     212          break;
     213          } else if( cnt == 100 ) {
     214          cout << "w.";
     215          cnt = 0;
     216          }
     217          */
     218     }
     219 
     220     tianwangFile.Close();
     221     link4SEFile.Close();
     222 }
     223 
     224 /***********************************************************************
     225  * Function name: DownloadFile
     226  * Input argv:
     227  *     -- pTianwang: the CCrawl handle
     228  *     -- pLink4SE: the CCrawl handle
     229  *     -- iUrl: the URL for crawling
     230  *     -- nGSock: the previous global socket
     231  * Output argv:
     232  *     --
     233  * Return:
     234  ***********************************************************************/
     235 
     236 //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库
     237 void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
     238         CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock) {
     239     char *downloaded_file = NULL, //网页体信息
     240             *fileHead = NULL, //网页头信息
     241             *location = NULL; //网页的重定向信息
     242     int file_length = 0; //网页体真实的字节长度
     243     string strUrlLocation = ""; //保存网页的重定向超链接
     244 
     245     //之后请求的网页和之前请求的网页位于同一个主机上,我们可以利用之前的套接字文件描述符进行通信,这样我们可以节约带宽,节省时间
     246     int nSock = nGSock; //将之前的套接字文件描述符赋值给nSock
     247 
     248     cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;
     249 
     250     CHttp http;
     251 
     252     //这是一个真正的抓取网页的函数,有了URL搜集系统可以根据URL的标识抓取其对应的网页
     253     file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead,
     254             &location, &nSock);
     255 
     256     int nCount = 0; //用来标识URL重定向的次数,如果重定向了3次,我们就不要抓取它对应的网页
     257 
     258     while (file_length == -300) //表明该iUrl.m_sUrl对应的网页重定向了
     259     { // moved to an another place
     260         if (strlen(location) > URL_LEN - 1 || nCount == 3
     261                 || strlen(location) == 0) {
     262             if (location) {
     263                 //pthread_mutex_lock(&mutexMemory); 
     264                 free(location);
     265                 location = NULL;
     266                 //pthread_mutex_unlock(&mutexMemory);
     267             }
     268             file_length = -1;
     269             break;
     270         }
     271 
     272         //将获取到的重定向的URL给strUrlLocation为下次抓取网页做准备
     273         strUrlLocation = location;
     274         if (location) {
     275             //pthread_mutex_lock(&mutexMemory);
     276             free(location);
     277             location = NULL;
     278             //pthread_mutex_unlock(&mutexMemory);
     279         }
     280 
     281         //这个地方要注意,因为重定向的URL可能是相对路径,所以我们必须将它转化为绝对路径
     282         //跟CPage类中提取超链接信息一样
     283         string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");
     284 
     285         if (idx1 != 0) { //没有找"http://"协议号
     286 
     287             char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length() - 1);
     288             char c2 = strUrlLocation.at(0);
     289 
     290             if (c2 == '/') //重定向的URL一定是相对路径
     291                     {
     292                 strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;
     293             } else if (c1 != '/' && c2 != '/') {
     294                 string::size_type idx;
     295 
     296                 idx = iUrl.m_sUrl.rfind('/');
     297                 if (idx != string::npos) {
     298                     if (idx > 6) { // > strlen("http://..")
     299                         strUrlLocation = iUrl.m_sUrl.substr(0, idx + 1)
     300                                 + strUrlLocation;
     301                     } else {
     302                         strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
     303                     }
     304 
     305                 } else {
     306                     file_length = -1;
     307                     break;
     308                 }
     309             } else {
     310                 if (c1 == '/') {
     311                     strUrlLocation = iUrl.m_sUrl + strUrlLocation;
     312                 } else {
     313                     strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
     314                 }
     315             }
     316         }
     317 
     318         CPage iPage;
     319         if (iPage.IsFilterLink(strUrlLocation)) { //如果得到的重定向URL是要过滤的URL,我们立刻结束不再抓取
     320             file_length = -1;
     321             break;
     322         }
     323 
     324         cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;
     325         file_length = http.Fetch(strUrlLocation, &downloaded_file, &fileHead,
     326                 &location, &nSock);
     327         nCount++;
     328     }
     329 
     330     nGSock = nSock; //将新得到的套接字文件描述符给之前的套接字文件描述符,为下次重用做准备
     331 
     332     if (file_length == -1) { //其他的各种错误,这个错误的原因在http.Fetch()中
     333         cout << "!-: " << iUrl.m_sUrl << endl;
     334         //pthread_mutex_lock(&mutexMemory);
     335         if (fileHead) {
     336             free(fileHead);
     337             fileHead = NULL;
     338         }
     339         if (downloaded_file) {
     340             free(downloaded_file);
     341             downloaded_file = NULL;
     342         }
     343         //pthread_mutex_unlock(&mutexMemory);
     344 
     345         cout << "-unreach host: " << iUrl.m_sHost << endl;
     346         ;
     347         return;
     348     }
     349 
     350     if (file_length == -2) { // out of ip block .//在IP阻塞范围内
     351         //pthread_mutex_lock(&mutexMemory);
     352         if (fileHead) {
     353             free(fileHead);
     354             fileHead = NULL;
     355         }
     356         if (downloaded_file) {
     357             free(downloaded_file);
     358             downloaded_file = NULL;
     359         }
     360         //pthread_mutex_unlock(&mutexMemory);
     361 
     362         // save unreach host
     363         SaveUnreachHost(iUrl.m_sHost);
     364 
     365         cout << "-out of block host: " << iUrl.m_sHost << endl;
     366         ;
     367         return;
     368     }
     369 
     370     if (file_length == -3) { // invalid host or ip//URL的主机号是无效的主机号
     371         //pthread_mutex_lock(&mutexMemory);
     372         if (fileHead) {
     373             free(fileHead);
     374             fileHead = NULL;
     375         }
     376         if (downloaded_file) {
     377             free(downloaded_file);
     378             downloaded_file = NULL;
     379         }
     380         //pthread_mutex_unlock(&mutexMemory);
     381         cout << "-invalid host: " << iUrl.m_sHost << endl;
     382         return;
     383     }
     384 
     385     if (file_length == -4) { // MIME is image/xxx//图片类型的网页
     386         //pthread_mutex_lock(&mutexMemory);
     387         if (fileHead) {
     388             free(fileHead);
     389             fileHead = NULL;
     390         }
     391         if (downloaded_file) {
     392             free(downloaded_file);
     393             downloaded_file = NULL;
     394         }
     395         //pthread_mutex_unlock(&mutexMemory);
     396 
     397         if (m_ofsLink4HistoryFile) { //为历史网页存档准备的链接
     398             pthread_mutex_lock(&mutexLink4HistoryFile);
     399             m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;
     400             ; //将该URL保存在link4History.url文件中
     401             pthread_mutex_unlock(&mutexLink4HistoryFile);
     402         }
     403 
     404         cout << "-imgage host: " << iUrl.m_sHost << endl;
     405         return;
     406     }
     407 
     408     /* still experiment
     409      char **dir;
     410      dir =  ParseRobot( downloaded_file, file_length);
     411      for( int i = 0; dir[i] != NULL ; i++){
     412      cout << dir[i] << endl;
     413      free( dir[i] );
     414      }
     415 
     416      exit(1);
     417      */
     418 
     419     // so small, maybe some unuseful info, skipped
     420     //if(file_length < 40){    // for ImgSE, 
     421     /*
     422      if(file_length < 256){    // for SE
     423      //pthread_mutex_lock(&mutexMemory);
     424      if (fileHead)
     425      {
     426      free(fileHead); fileHead=NULL;
     427      }
     428      if (downloaded_file)
     429      {
     430      free(downloaded_file); downloaded_file=NULL;
     431      }
     432      //pthread_mutex_unlock(&mutexMemory);
     433      cout << "#";
     434      return;
     435      }
     436      */
     437 
     438     // deal with normal page
     439 
     440     //处理正常的网页[网页头信息和网页体信息只要有一个是NULL,我们就认为它不是正常的网页]
     441     if (!fileHead || !downloaded_file) //不能获得网页头信息或者网页体信息
     442             {
     443         //pthread_mutex_lock(&mutexMemory);
     444         if (fileHead) {
     445             free(fileHead);
     446             fileHead = NULL;
     447         }
     448         if (downloaded_file) {
     449             free(downloaded_file);
     450             downloaded_file = NULL;
     451         }
     452         //pthread_mutex_unlock(&mutexMemory);
     453         close(nGSock);
     454         nGSock = -1;
     455         cout << "-size0 host: " << iUrl.m_sHost << endl;
     456         return;
     457     }
     458 
     459     //这里很重要,将抓取到的网页信息全部放入CPage类中
     460     CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file,
     461             file_length);
     462     //pthread_mutex_lock(&mutexMemory);
     463     if (fileHead) {
     464         free(fileHead);
     465         fileHead = NULL;
     466     }
     467     if (downloaded_file) {
     468         free(downloaded_file);
     469         downloaded_file = NULL;
     470     }
     471     //pthread_mutex_unlock(&mutexMemory);
     472 
     473     //解析网页头信息
     474     iPage.ParseHeaderInfo(iPage.m_sHeader);
     475 
     476     if (iPage.m_bConnectionState == false) {
     477         close(nGSock);
     478         nGSock = -1;
     479     }
     480 
     481     // when crawling images for ImgSE, remember to comment the paragraph
     482     // when crawling plain text for SE, remember to open the paragraph
     483     // paragraph begin
     484 
     485     // iPage.m_sContentType != "text/css" &&
     486 
     487     //过滤掉不是我们想要的网页体的类型
     488     if (iPage.m_sContentType != "text/html"
     489             && iPage.m_sContentType != "text/plain"
     490             && iPage.m_sContentType != "text/xml"
     491             && iPage.m_sContentType != "application/msword"
     492             && iPage.m_sContentType != "application/pdf"
     493             && iPage.m_sContentType != "text/rtf"
     494             && iPage.m_sContentType != "application/postscript"
     495             && iPage.m_sContentType != "application/vnd.ms-execl"
     496             && iPage.m_sContentType != "application/vnd.ms-powerpoint") {
     497 
     498         cout << "-unwant type  host: " << iUrl.m_sHost << endl;
     499         return;
     500     }
     501 
     502     // paragraph end
     503 
     504     //解压缩开始
     505     //如果是gzip编码,要解压缩,然后提取超链接信息,现在门户网站的首页有增大趋势
     506     //为了加快传输速度,通常采用gzip编码压缩后传输
     507     char sUnzipContent[1024000]; //1000K<1M
     508     int nUnzipLength = 0;
     509     if (iPage.m_sContentEncoding == "gzip"
     510             && iPage.m_sContentType == "text/html") {
     511 
     512         gzFile zip;
     513         //这是一个过渡文件,将没有解压缩的网页体信息放入到这个文件中
     514         string ofsGzipName;
     515 
     516         ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";
     517 
     518         //以二进制截断的方式打开文件
     519         //ios::trunc 如果文件存在,则将文件长度截断为0,并清除文件的内容,如果文件不存在,则创建该文件
     520         ofstream ofsDownloadFile(ofsGzipName.c_str(), ios::trunc | ios::binary);
     521 
     522         cout << "file_length: " << file_length << endl;
     523         ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);
     524         ofsDownloadFile.close();
     525 
     526         zip = gzopen(ofsGzipName.c_str(), "rb");
     527         if (zip == NULL) {
     528             cout << "Open zip file " << ofsGzipName.c_str() << " error."
     529                     << endl;
     530             exit(-1);
     531         }
     532 
     533         //解压缩过程,将解压缩后的网页体信息放入到缓冲区域sUnzipContent
     534         nUnzipLength = gzread(zip, sUnzipContent, 1024000);
     535         if (nUnzipLength == -1) {
     536             cout << "Read zip file " << ofsGzipName.c_str() << " error."
     537                     << endl;
     538             exit(-1);
     539         }
     540 
     541         sUnzipContent[nUnzipLength] = 0;
     542 
     543         gzclose(zip);
     544 
     545         //将解压缩后的网页体信息覆盖原来的没有解压缩的网页体信息
     546         //iPage.m_sContent.assign(sUnzipContent,nUnzipLength);
     547         //iPage.m_nLenContent=nUnzipLength;
     548     }
     549     //解压缩结束
     550 
     551     CMD5 iMD5;
     552     string strDigest;
     553 
     554     /////////////////////////////
     555     // because we can make sure the url in the setVisitedUrlMd5
     556     // is not same(we have check it before insert it to the collection),
     557     // we intert it directly.  however...
     558     //iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() );
     559 
     560     //判断该URL是否在open list[setVisitedUrlMD5]中,在返回;不在加到open list中,并保存
     561     iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),iUrl.m_sUrl.length());
     562     strDigest = iMD5.ToString();
     563 
     564     pthread_mutex_lock(&mutexVisitedUrlMD5);
     565     if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) //已经抓取过了!
     566             {
     567         cout << "!vurl: "; //1.crawled already
     568         pthread_mutex_unlock(&mutexVisitedUrlMD5);
     569         return;
     570     }
     571 
     572     //不在setVisitedUrlMD5中,现在必须插入setVisitedUrlMD5中
     573     //因为该URL现在已经访问过了
     574     setVisitedUrlMD5.insert(strDigest);
     575     SaveVisitedUrlMD5(strDigest);
     576     pthread_mutex_unlock(&mutexVisitedUrlMD5);
     577 
     578     /////////////////////////////
     579     // whether it is a visited page
     580     // for ImgSE, should comment this paragraph
     581     // for SE, should uncomment this paragraph
     582 
     583     // begin
     584 
     585     //判断该网页体是否已经访问过,访问过返回,没有访问过加到setVisitedPageMD5集合中
     586     iMD5.GenerateMD5((unsigned char*) iPage.m_sContent.c_str(),iPage.m_sContent.length());
     587     strDigest = iMD5.ToString();
     588     pthread_mutex_lock(&mutexVisitedPageMD5);
     589     //网页体MD5同URL的关系插入到容器replicas中
     590     replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl));
     591     if (setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end()) //在setVisitedPageMD5中:表明出现了镜像网页
     592             {
     593         cout << "!vpage: "; // crawled already
     594         pthread_mutex_unlock(&mutexVisitedPageMD5);
     595         return;
     596     }
     597     setVisitedPageMD5.insert(strDigest);
     598 
     599     SaveVisitedPageMD5(strDigest);
     600     pthread_mutex_unlock(&mutexVisitedPageMD5);
     601 
     602     // end
     603 
     604     cout << "+";
     605 
     606     ////////////////////
     607     // save as Tianwang format
     608     //将抓取到的网页以天网格式放到原始网页库中
     609     SaveTianwangRawData(pTianwangFile, &iUrl, &iPage);
     610 
     611     ////////////////////
     612     // save visited Urls
     613     if (iPage.m_sLocation.length() < 1) {
     614         SaveVisitedUrl(iUrl.m_sUrl);
     615     } else {
     616         SaveVisitedUrl(iPage.m_sLocation);
     617     }
     618 
     619     //return;    // just crawl seeds
     620 
     621     /////////////////////////////////////
     622     // Parse hyperlinks
     623     if (iPage.m_sContentType != "text/html") { // we can only find links in tex/html
     624         return;
     625     }
     626 
     627     /*
     628      if (iPage.ParseHyperLinks() == false){
     629      return;
     630      }
     631 
     632      SaveLink4SE( &iPage);
     633      SaveLink4History( &iPage);
     634 
     635      map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin();
     636      string str;
     637      for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){
     638      str = (*it4SE).first;
     639      AddUrl( str.c_str() );
     640 
     641      }
     642      */
     643     // using XIE Han's link parser
     644 
     645     struct uri page_uri;
     646     //FILE *tmp;
     647 
     648     //tmp = tmpfile();
     649 
     650     //fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp);
     651     //fseek(tmp, 0, SEEK_SET);
     652     //fclose(tmp);
     653 
     654     pthread_mutex_lock(&mutexDetect);
     655 
     656     if (iPage.m_sLocation.empty()) {
     657         uri_parse_string(iPage.m_sUrl.c_str(), &page_uri);
     658     } else {
     659         uri_parse_string(iPage.m_sLocation.c_str(), &page_uri);
     660     }
     661 
     662     struct package p = { this, &iPage };
     663     //hlink_detect(tmp, &page_uri, onfind, &p);
     664 
     665     hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p);
     666 
     667     struct file_arg pLinks = { &iUrl, &iPage };
     668 
     669     SaveLink4SE031121(&pLinks);
     670 
     671     // save as Link4SE format
     672     //SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage);
     673 
     674     pthread_mutex_unlock(&mutexDetect);
     675 
     676     uri_destroy(&page_uri);
     677     cout << "Parse End......" << endl;
     678 
     679     return;
     680 }
     681 
     682 void SaveReplicas(const char* filename) {
     683     //ofstream ofs(filename, ios::out|ios::app);
     684     ofstream ofs(filename, ios::out | ios::binary | ios::app);
     685     if (!ofs) {
     686         cout << "error open file " << endl;
     687     }
     688     string md5;
     689 
     690     pthread_mutex_lock(&mutexReplicas);
     691     multimap<string, string, less<string> >::const_iterator it;
     692     ostringstream *oss = 0;
     693     int i = 0;
     694     for (it = replicas.begin(); it != replicas.end(); it++) {
     695         if (!md5.empty() && md5 != it->first) {
     696             if (i >= 2)
     697                 ofs << (*oss).str() << endl;
     698             //pthread_mutex_lock(&mutexMemory);
     699             delete (oss);
     700             oss = new ostringstream;
     701             //pthread_mutex_unlock(&mutexMemory);
     702             (*oss) << it->first << endl;
     703             i = 0;
     704             md5 = it->first;
     705         } else if (md5.empty()) {
     706             md5 = it->first;
     707             //pthread_mutex_lock(&mutexMemory);
     708             oss = new ostringstream;
     709             //pthread_mutex_unlock(&mutexMemory);
     710             (*oss) << it->first << endl;
     711             i = 0;
     712         }
     713         if (oss != 0)
     714             (*oss) << it->second << endl;
     715         i++;
     716     }
     717 
     718     pthread_mutex_unlock(&mutexReplicas);
     719 }
     720 
     721 ////////////////////////////////////////////////////////////////////////////
     722 // Construction/Destruction
     723 ////////////////////////////////////////////////////////////////////////////
     724 
     725 CCrawl::CCrawl() {
     726 }
     727 
     728 CCrawl::CCrawl(string inputFileName, string outputFileName) {
     729     m_sInputFileName = inputFileName;
     730     m_sOutputFileName = outputFileName; // + ".txt"
     731 }
     732 
     733 CCrawl::~CCrawl() {
     734     m_ofsVisitedUrlFile.close();
     735     m_ofsLink4SEFile.close();
     736     m_ofsLink4HistoryFile.close();
     737     m_isamFile.Close();
     738     m_ofsVisitedUrlMD5File.close();
     739     m_ofsVisitedPageMD5File.close();
     740 }
     741 
     742 /*****************************************************************
     743  ** Function name: SigTerm
     744  ** Input argv:
     745  **      --
     746  ** Output argv:
     747  **      --
     748  ** Return:
     749  ** Function Description: signal function
     750  ** Version: 1.0
     751  ** Be careful:
     752  *****************************************************************/
     753 static void SigTerm(int x) {//信号处理函数
     754     SaveUnvisitedUrl();
     755     SaveReplicas("repli");
     756 
     757     cout << "Terminated!" << endl;
     758     exit(0);
     759 }
     760 
     761 void CCrawl::GetVisitedUrlMD5() {//得到已经访问过的URL对应的MD5值,放入open list[setVisitedUrlMD5]中
     762     ifstream ifsMD5(URL_MD5_FILE.c_str(), ios::binary);
     763     if (!ifsMD5) {
     764         //cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl;
     765         return;
     766     }
     767 
     768     string strMD5;
     769     while (getline(ifsMD5, strMD5)) {
     770         setVisitedUrlMD5.insert(strMD5);
     771     }
     772 
     773     ifsMD5.close();
     774     cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls"
     775             << endl;
     776 }
     777 
     778 void CCrawl::GetVisitedPageMD5() {//得到已经访问过的web网页体对应的MD5值,放入setVisitedPageMD5中
     779     ifstream ifsMD5(PAGE_MD5_FILE.c_str(), ios::binary);
     780     if (!ifsMD5) {
     781         //cerr << "did not find " << PageMD5_FILE << " for iutput" << endl;
     782         return;
     783     }
     784 
     785     string strMD5;
     786     while (getline(ifsMD5, strMD5)) {
     787         setVisitedPageMD5.insert(strMD5);
     788     }
     789 
     790     ifsMD5.close();
     791     cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages"
     792             << endl;
     793 }
     794 
     795 void CCrawl::GetIpBlock() {//得到阻塞的IP,放入mapIpBlock容器中
     796     ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());
     797     if (!ifsIpBlock) {
     798         //cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;
     799         return;
     800     }
     801     string strIpBlock;
     802     while (getline(ifsIpBlock, strIpBlock)) {
     803         if (strIpBlock[0] == '\0' || strIpBlock[0] == '#'
     804                 || strIpBlock[0] == '\n') {
     805 
     806             continue;
     807         }
     808 
     809         char buf1[64], buf2[64];
     810 
     811         buf1[0] = '\0';
     812         buf2[0] = '\0';
     813         sscanf(strIpBlock.c_str(), "%s %s", buf1, buf2);
     814 
     815         mapIpBlock.insert(valTypeIpBlock(inet_addr(buf1), inet_addr(buf2)));
     816     }
     817     ifsIpBlock.close();
     818 
     819 }
     820 
     821 void CCrawl::GetUnreachHostMD5() {//得到不可到达的主机号,放入setUnreachHostMD5中
     822     //vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);
     823     ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str());
     824     if (!ifsUnreachHost) {
     825         cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl;
     826         return;
     827     }
     828 
     829     string strUnreachHost;
     830     //int i=0;
     831     while (getline(ifsUnreachHost, strUnreachHost)) {
     832         if (strUnreachHost[0] == '\0' || strUnreachHost[0] == '#'
     833                 || strUnreachHost[0] == '\n') {
     834 
     835             continue;
     836         }
     837 
     838         CStrFun::Str2Lower(strUnreachHost, strUnreachHost.size());
     839         //vsUnreachHost.push_back(strUnreachHost);
     840         CMD5 iMD5;
     841         iMD5.GenerateMD5((unsigned char*) strUnreachHost.c_str(),
     842                 strUnreachHost.size());
     843         string strDigest = iMD5.ToString();
     844         setUnreachHostMD5.insert(strDigest);
     845         //i++;
     846         //if(i == MAX_UNREACHABLE_HOST_NUM) break;
     847     }
     848 
     849     ifsUnreachHost.close();
     850 
     851 }
     852 
     853 /**************************************************************************************
     854  *  Function name: SaveTianwangRawData
     855  *  Input argv:
     856  *      --    pTianwangFile: tianwang file handle
     857  *      --    pUrl: url
     858  *      --    pPage: web page
     859  *  Output argv:
     860  *      --
     861  *  Return:
     862  *  Function Description: save raw page data as tianwang file
     863  **************************************************************************************/
     864 void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile, CUrl *pUrl,CPage *pPage) {//将抓取的网页以天网格式存储
     865     if (!pTianwangFile || !pUrl || !pPage) {
     866         return;
     867     }
     868 
     869     file_arg arg;
     870     arg.pUrl = pUrl;
     871     arg.pPage = pPage;
     872 
     873     // each thread writes itself, so dnnot need mutex
     874     pTianwangFile->Write((void*) &arg);
     875 }
     876 
     877 /**************************************************************************************
     878  *  Function name: SaveLink4SERawData
     879  *  Input argv:
     880  *      --    pLink4SEFile: link4SE file handle
     881  *      --    pUrl: url
     882  *      --    pPage: web page
     883  *  Output argv:
     884  *      --
     885  *  Return:
     886  *  Function Description: save raw page data as tianwang file
     887  **************************************************************************************/
     888 void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile, CUrl *pUrl,
     889         CPage *pPage) { //将抓取的网页从中提取超链接信息建立网页结构库
     890     if (!pLink4SEFile || !pUrl || !pPage) {
     891         return;
     892     }
     893 
     894     file_arg arg;
     895     arg.pUrl = pUrl;
     896     arg.pPage = pPage;
     897 
     898     // each thread writes itself, so dnnot need mutex
     899     pLink4SEFile->Write((void*) &arg);
     900 }
     901 
     902 /**************************************************************************************
     903  *  Function name: SaveIsamRawData
     904  *  Input argv:
     905  *      --    pUrl: url
     906  *      --    pPage: web page
     907  *  Output argv:
     908  *      --
     909  *  Return:
     910  *  Function Description: save raw page data as ISAM file
     911  **************************************************************************************/
     912 void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage) {
     913     if (!pUrl || !pPage) {
     914         return;
     915     }
     916 
     917     file_arg arg;
     918     arg.pUrl = pUrl;
     919     arg.pPage = pPage;
     920 
     921     pthread_mutex_lock(&mutexIsamFile);
     922 
     923     m_isamFile.Write((void *) &arg);
     924 
     925     pthread_mutex_unlock(&mutexIsamFile);
     926 }
     927 
     928 /**************************************************************************************
     929  *  Function name: SaveVisitedUrl
     930  *  Input argv:
     931  *      --    url: url
     932  *  Output argv:
     933  *      --
     934  *  Return:
     935  *  Function Description: save raw the Visited Url
     936  **************************************************************************************/
     937 void CCrawl::SaveVisitedUrl(string url) {
     938     if (m_ofsVisitedUrlFile) {
     939         pthread_mutex_lock(&mutexVisitedUrlFile);
     940 
     941         m_ofsVisitedUrlFile << url << endl;
     942 
     943         pthread_mutex_unlock(&mutexVisitedUrlFile);
     944     }
     945 }
     946 
     947 void CCrawl::SaveUnreachHost(string host) {
     948     CMD5 iMD5;
     949     iMD5.GenerateMD5((unsigned char*) host.c_str(), host.size());
     950     string strDigest = iMD5.ToString();
     951     if (setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end()) {
     952         pthread_mutex_lock(&mutexUnreachHost);
     953 
     954         setUnreachHostMD5.insert(strDigest);
     955         if (m_ofsUnreachHostFile) {
     956             m_ofsUnreachHostFile << host << endl;
     957         }
     958 
     959         pthread_mutex_unlock(&mutexUnreachHost);
     960     }
     961 }
     962 
     963 void CCrawl::SaveLink4SE(CPage *iPage) {
     964     if (m_ofsLink4SEFile && iPage->m_nRefLink4SENum > 0) {
     965         pthread_mutex_lock(&mutexLink4SEFile);
     966 
     967         m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl;
     968         m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl;
     969         m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl;
     970         m_ofsLink4SEFile << "link_anchortext: " << endl;
     971 
     972         map<string, string>::iterator it4SE = iPage->m_mapLink4SE.begin();
     973         for (; it4SE != iPage->m_mapLink4SE.end(); ++it4SE) {
     974 
     975             m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second
     976                     << endl;
     977             ;
     978 
     979         }
     980 
     981         pthread_mutex_unlock(&mutexLink4SEFile);
     982     }
     983 }
     984 
     985 bool CCrawl::SaveLink4SE031121(void *arg) {
     986     if (!arg || !m_ofsLink4SEFile)
     987         return false;
     988 
     989     //pthread_mutex_lock(&mutexLink4SEFile);
     990 
     991     if (vsParsedLinks.size() == 0)
     992         return false;
     993 
     994     file_arg *pFile = (file_arg *) arg;
     995 
     996     CUrl *iUrl = pFile->pUrl;
     997     CPage *iPage = pFile->pPage;
     998 
     999     char strDownloadTime[128];
    1000     time_t tDate;
    1001 
    1002     memset(strDownloadTime, 0, 128);
    1003     time(&tDate);
    1004     strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
    1005 
    1006     string links;
    1007     vector<string>::iterator it = vsParsedLinks.begin();
    1008     for (; it != vsParsedLinks.end(); ++it) {
    1009         links = links + *it + "\n";
    1010     }
    1011 
    1012     m_ofsLink4SEFile << "version: 1.0\n";
    1013     if (iPage->m_sLocation.size() == 0) {
    1014         m_ofsLink4SEFile << "url: " << iPage->m_sUrl;
    1015     } else {
    1016         m_ofsLink4SEFile << "url: " << iPage->m_sLocation;
    1017         m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl;
    1018     }
    1019 
    1020     m_ofsLink4SEFile << "\ndate: " << strDownloadTime;
    1021 
    1022     if (mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end()) {
    1023         m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost;
    1024     } else {
    1025         m_ofsLink4SEFile << "\nip: "
    1026                 << (*(mapCacheHostLookup.find(iUrl->m_sHost))).second;
    1027     }
    1028 
    1029     m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size();
    1030     m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1
    1031             << "\n\n" << iPage->m_sHeader << "\n";
    1032     m_ofsLink4SEFile << links;
    1033     m_ofsLink4SEFile << endl;
    1034 
    1035     vsParsedLinks.clear();
    1036     //pthread_mutex_unlock(&mutexLink4SEFile);
    1037 
    1038     return true;
    1039 }
    1040 
    1041 // not well
    1042 void CCrawl::SaveLink4History(CPage *iPage) {//保存为历史网页存档准备的超链接信息
    1043     if (m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum > 0) {
    1044         pthread_mutex_lock(&mutexLink4HistoryFile);
    1045 
    1046         //m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl;
    1047         //m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl;    
    1048         //m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl;
    1049         //m_ofsLink4HistoryFile << "link: " << endl;
    1050 
    1051         vector<string>::iterator it4History = iPage->m_vecLink4History.begin();
    1052         for (; it4History != iPage->m_vecLink4History.end(); ++it4History) {
    1053             string s = *it4History;
    1054             m_ofsLink4HistoryFile << s << endl;
    1055         }
    1056 
    1057         pthread_mutex_unlock(&mutexLink4HistoryFile);
    1058     }
    1059 }
    1060 
    1061 /**************************************************************************************
    1062  *  Function name: SaveVisitedUrlMd5
    1063  *  Input argv:
    1064  *      --    md5: page md5 value
    1065  *  Output argv:
    1066  *      --
    1067  *  Return:
    1068  *  Function Description: save the visited url Md5
    1069  **************************************************************************************/
    1070 void CCrawl::SaveVisitedUrlMD5(string md5) {
    1071     if (m_ofsVisitedUrlMD5File) {
    1072         m_ofsVisitedUrlMD5File << md5 << endl;
    1073     }
    1074 }
    1075 
    1076 /**************************************************************************************
    1077  *  Function name: SaveVisitedPageMd5
    1078  *  Input argv:
    1079  *      --    md5: page md5 value
    1080  *  Output argv:
    1081  *      --
    1082  *  Return:
    1083  *  Function Description: save the visited url Md5
    1084  **************************************************************************************/
    1085 void CCrawl::SaveVisitedPageMD5(string md5) {
    1086     if (m_ofsVisitedPageMD5File) {
    1087         m_ofsVisitedPageMD5File << md5 << endl;
    1088     }
    1089 }
    1090 
    1091 /**************************************************************************************
    1092  *  Function name: OpenFileForOutput
    1093  *  Input argv:
    1094  *      --
    1095  *  Output argv:
    1096  *      --
    1097  *  Return:
    1098  *  Function Description: Open the files for output
    1099  **************************************************************************************/
    1100 void CCrawl::OpenFilesForOutput() {
    1101     // open isam file for output
    1102     m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME);
    1103 
    1104     // open visited.url file for output
    1105     m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(),
    1106             ios::out | ios::app | ios::binary);
    1107     if (!m_ofsVisitedUrlFile) {
    1108         cerr << "cannot open " << VISITED_FILE << " for output\n" << endl;
    1109     }
    1110 
    1111     // open link4SE.url file for output
    1112     m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(),
    1113             ios::out | ios::app | ios::binary);
    1114     if (!m_ofsLink4SEFile) {
    1115         cerr << "cannot open " << LINK4SE_FILE << " for output\n" << endl;
    1116     }
    1117 
    1118     // open link4History.url file for output
    1119     m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(),
    1120             ios::out | ios::app | ios::binary);
    1121     if (!m_ofsLink4HistoryFile) {
    1122         cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl;
    1123     }
    1124 
    1125     // open unreach host file for output
    1126     m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(),
    1127             ios::out | ios::app | ios::binary);
    1128     if (!m_ofsUnreachHostFile) {
    1129         cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl;
    1130     }
    1131 
    1132     // open visited url md5 file for output
    1133     m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(),
    1134             ios::out | ios::app | ios::binary);
    1135     if (!m_ofsVisitedUrlMD5File) {
    1136         cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl;
    1137     }
    1138 
    1139     // open visited page md5 file for output
    1140     m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(),
    1141             ios::out | ios::app | ios::binary);
    1142     if (!m_ofsVisitedPageMD5File) {
    1143         cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl;
    1144     }
    1145 }
    1146 
    1147 /***************************************************************************************
    1148  *  Function name: DoCrawl
    1149  *  Input argv:
    1150  *      --
    1151  *  Output argv:
    1152  *      --
    1153  *  Return:
    1154  *  Function Description: the main function for crawl
    1155  *  Be careful:
    1156  ***************************************************************************************/
    1157 void CCrawl::DoCrawl() {//CCrawl类中的总控函数
    1158     /* set the signal function */
    1159     signal(SIGTERM, SigTerm);
    1160     signal(SIGKILL, SigTerm);
    1161     signal(SIGINT, SigTerm);
    1162     signal(SIGPIPE, SIG_IGN);
    1163     signal(SIGCHLD, SIG_IGN);
    1164 
    1165     // output the begin time
    1166     char strTime[128];
    1167     time_t tDate;
    1168 
    1169     memset(strTime, 0, 128);
    1170     time(&tDate);
    1171     strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
    1172     cout << "\n\nBegin at: " << strTime << "\n\n";
    1173 
    1174     // get the other info from file
    1175     GetVisitedUrlMD5();
    1176     GetVisitedPageMD5();
    1177 
    1178     GetIpBlock();
    1179 
    1180     GetUnreachHostMD5();
    1181 
    1182     // open the seed url file
    1183     ifstream ifsSeed(m_sInputFileName.c_str());
    1184     if (!ifsSeed) {
    1185         cerr << "Cannot open " << m_sInputFileName << " for input\n";
    1186         return;
    1187     }
    1188 
    1189     // open the files for output
    1190     OpenFilesForOutput();
    1191 
    1192     // Create thread ID structures. 
    1193     pthread_t *tids = (pthread_t*) malloc(NUM_WORKERS * sizeof(pthread_t));
    1194     if (tids == NULL) {
    1195         cerr << "malloc error" << endl;
    1196     }
    1197 
    1198     for (unsigned int i = 0; i < NUM_WORKERS; i++) {
    1199         if (pthread_create(&tids[i], NULL, start, this))
    1200             cerr << "create threads error" << endl;
    1201     }
    1202 
    1203     string strUrl;
    1204     CPage iCPage;
    1205     while (getline(ifsSeed, strUrl)) {
    1206         string::size_type idx;
    1207 
    1208         if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
    1209             continue;
    1210         }
    1211 
    1212         idx = strUrl.find('\t');
    1213         if (idx != string::npos) {
    1214             strUrl = strUrl.substr(0, idx);
    1215         }
    1216 
    1217         //idx = strUrl.find("http");
    1218         idx = CStrFun::FindCase(strUrl, "http");
    1219         if (idx == string::npos) {
    1220             //continue;
    1221             idx = strUrl.find('/');
    1222             if (idx == string::npos) {
    1223                 strUrl = "http://" + strUrl + "/";
    1224             } else {
    1225                 strUrl = "http://" + strUrl;
    1226             }
    1227         }
    1228 
    1229         //if( strUrl.length() < 8 ) continue;
    1230 
    1231         if (iCPage.IsFilterLink(strUrl))
    1232             continue;
    1233         AddUrl(strUrl.c_str());
    1234     }
    1235 
    1236     // Get the unvisited URL
    1237     ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str());
    1238     if (ifsUnvisitedUrl) {
    1239         while (getline(ifsUnvisitedUrl, strUrl)) {
    1240             string::size_type idx;
    1241 
    1242             if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') {
    1243                 continue;
    1244             }
    1245 
    1246             idx = strUrl.find('\t');
    1247             if (idx != string::npos) {
    1248                 strUrl = strUrl.substr(0, idx);
    1249             }
    1250 
    1251             // filter invalid urls
    1252             if (iCPage.IsFilterLink(strUrl))
    1253                 continue;
    1254 
    1255             AddUrl(strUrl.c_str());
    1256         }
    1257     } else {
    1258         //cerr << "Cannot open " << UNVISITED_FILE << " for input\n";
    1259     }
    1260 
    1261     // sleep(30);
    1262     b_fOver = true;
    1263     cout << "finished to get all unvisited urls." << endl;
    1264 
    1265     // Wait for the threads. 
    1266     for (unsigned int i = 0; i < NUM_WORKERS; ++i) {
    1267         (void) pthread_join(tids[i], NULL);
    1268     }
    1269 
    1270     cout << "closed " << NUM_WORKERS << " threads." << endl;
    1271 
    1272     SaveUnvisitedUrl();
    1273     SaveReplicas("repli");
    1274 
    1275     memset(strTime, 0, 128);
    1276     time(&tDate);
    1277     strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate));
    1278     cout << "\n\nEnd at: " << strTime << "\n\n";
    1279 }
    1280 
    1281 /*****************************************************************
    1282  ** Function name: AddUrl
    1283  ** Input argv:
    1284  **      --
    1285  ** Output argv:
    1286  **      --
    1287  ** Return:
    1288  ** Function Description: Add a parsed url into the collection
    1289  ** Version: 1.0
    1290  ** Be careful:   An important function!!!
    1291  *****************************************************************/
    1292 void CCrawl::AddUrl(const char * url) {
    1293     string strUrl = url;
    1294     if (strUrl.empty() || strUrl.size() < 8) { //invalid url
    1295         cout << "!so small!" << strUrl << endl;
    1296         return;
    1297     }
    1298 
    1299     CPage iCPage;
    1300     if (iCPage.NormalizeUrl(strUrl) == false) {
    1301         // cout << "!normalize fail!" << strUrl << endl;
    1302         return;
    1303     }
    1304 
    1305     CUrl iUrl;
    1306 
    1307     // for ImgSE, comment the paragraph
    1308     // if image/xxx url, store it to link4History.url
    1309     // begin
    1310     if (iUrl.IsImageUrl(strUrl)) {
    1311         if (m_ofsLink4HistoryFile) {
    1312             pthread_mutex_lock(&mutexLink4HistoryFile);
    1313             m_ofsLink4HistoryFile << strUrl << endl;
    1314             ;
    1315             pthread_mutex_unlock(&mutexLink4HistoryFile);
    1316         }
    1317         return;
    1318     }
    1319     // end
    1320 
    1321     if (iUrl.ParseUrlEx(strUrl) == false) {
    1322         cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl;
    1323         return;
    1324     }
    1325 
    1326     // if it is an invalid host, discard it
    1327     if (iUrl.IsValidHost(iUrl.m_sHost.c_str()) == false) {
    1328         cout << "!invalid host: " << iUrl.m_sHost << endl;
    1329         return;
    1330     }
    1331 
    1332     // filter foreign hosts
    1333     if (iUrl.IsForeignHost(iUrl.m_sHost)) {
    1334         cout << "!foreign hosts: " << iUrl.m_sHost << endl;
    1335         return;
    1336     }
    1337 
    1338     // if it is a block ip, discard it
    1339     // this work is left in the CreatSocket()
    1340     // because the work of getting ip is inevitable in the CreatSocket function
    1341     //     and this work is expensive
    1342     // if it is an unreach host, discard it
    1343     // here we only deal with numbers-and-dots notations
    1344     unsigned long inaddr = 0;
    1345     char *ip = NULL;
    1346 
    1347     inaddr = (unsigned long) inet_addr(iUrl.m_sHost.c_str());
    1348     if (inaddr != INADDR_NONE) { // host is just ip
    1349         //pthread_mutex_lock(&mutexMemory);
    1350         ip = new char[iUrl.m_sHost.size() + 1];
    1351         //pthread_mutex_unlock(&mutexMemory);
    1352         memset(ip, 0, iUrl.m_sHost.size() + 1);
    1353         memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size());
    1354 
    1355         if (!iUrl.IsValidIp(ip)) { // out of ip block
    1356             //pthread_mutex_lock(&mutexMemory);
    1357             delete[] ip;
    1358             ip = NULL;
    1359             //pthread_mutex_unlock(&mutexMemory);
    1360             //cout << "!unreach hosts: " << iUrl.m_sHost << endl;
    1361             return;
    1362         }
    1363         //pthread_mutex_lock(&mutexMemory);
    1364         delete[] ip;
    1365         ip = NULL;
    1366         //pthread_mutex_unlock(&mutexMemory);
    1367     }
    1368 
    1369     CStrFun::Str2Lower(iUrl.m_sHost, iUrl.m_sHost.size());
    1370     CMD5 iMD5;
    1371     iMD5.GenerateMD5((unsigned char*) iUrl.m_sHost.c_str(),
    1372             iUrl.m_sHost.size());
    1373     string strDigest = iMD5.ToString();
    1374     if (setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end()) {
    1375         //cout << "!unreach host! " << iUrl.m_sHost << endl;    
    1376         return;
    1377     }
    1378 
    1379     // if crawled, discard it
    1380     iMD5.GenerateMD5((unsigned char*) strUrl.c_str(), strUrl.size());
    1381     strDigest = iMD5.ToString();
    1382 
    1383     if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) {
    1384         // cout << "!visited! " << strUrl << endl;    
    1385         return;
    1386     }
    1387 
    1388     // if already in the collection, discard it
    1389     if (setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end()) {
    1390         // cout << "!in collection! " << strUrl << endl;    
    1391         return;
    1392     } else {
    1393         pthread_mutex_lock(&mutexUnvisitedUrlMD5);
    1394         setUnvisitedUrlMD5.insert(strDigest);
    1395         pthread_mutex_unlock(&mutexUnvisitedUrlMD5);
    1396     }
    1397 
    1398     // add
    1399     // make sure limited threads crawling on a site
    1400     int cnt = 0;
    1401     for (;;) {
    1402         //if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){
    1403 
    1404         if (1) {
    1405             //pthread_mutex_lock(&mutexVisitedUrlMD5);
    1406 
    1407             // if crawled, discard it :) double secure
    1408             //if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
    1409             //cout << "!v! " << strUrl << endl;
    1410             //pthread_mutex_unlock(&mutexVisitedUrlMD5);
    1411             //return;
    1412             //} else {
    1413 
    1414             pthread_mutex_lock(&mutexVisitedUrlMD5);
    1415             mmapUrls.insert(mvalType(iUrl.m_sHost, strUrl));
    1416             pthread_mutex_unlock(&mutexVisitedUrlMD5);
    1417             break;
    1418             //}
    1419         } else {
    1420             cnt++;
    1421             if (cnt % 100 == 0) {
    1422                 cout << "~";
    1423                 //cnt = 0;
    1424             }
    1425 
    1426             // If we have waiting so long, we may remove it
    1427             if (cnt == 50000) {
    1428                 cout << "romove it!!!!!!!!!!!!!!!!!!!" << endl;
    1429                 break;
    1430             }
    1431             usleep(4000);
    1432         }
    1433 
    1434     }
    1435 
    1436 }
  • 相关阅读:
    vue学习
    BBS登录注册技术点归纳
    BBS项目模态框的使用
    django后台管理系统
    java 之 jsp简介
    http 之 CORS简介
    web 之 session
    linux 之学习路线
    Ubuntu 之 win10更新ubuntu启动项消失
    Web 之 Cookie
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2579561.html
Copyright © 2011-2022 走看看