1 #ifndef _Crawl_H_031104_ 2 #define _Crawl_H_031104_ 3 4 //#include <openssl/md5.h> 5 #include <zlib.h> 6 7 #include "Tse.h" 8 #include "Http.h" 9 #include "StrFun.h" 10 #include "Url.h" 11 #include "Page.h" 12 #include "TianwangFile.h" 13 #include "IsamFile.h" 14 #include "Link4SEFile.h" 15 16 using namespace std; 17 18 class CCrawl 19 { 20 public: 21 string m_sInputFileName; //种子URL的文件名字: tse_seed.pku 22 string m_sOutputFileName; //保存我们已经访问过的URL的文件名字: visited.all 23 24 CIsamFile m_isamFile; // ISAM file handle 25 26 ofstream m_ofsVisitedUrlFile; //visited.all的文件句柄 27 ofstream m_ofsLink4SEFile; //link4SE.url的文件句柄 28 ofstream m_ofsLink4HistoryFile; //link4History.url的文件句柄 29 ofstream m_ofsUnreachHostFile; //tse_unreachHost.list的文件句柄 30 31 ofstream m_ofsVisitedUrlMD5File;//tse_md5.visitedurl的文件句柄 32 ofstream m_ofsVisitedPageMD5File;//tse_md5.visitedpage 33 34 ofstream m_ofsUnreachUrlFile; // unreach URL file handle 35 36 37 public: 38 CCrawl();//无参构造函数 "tse_seed.pku" "visited.all" 39 CCrawl(string strInputFile, string strOutputFile); 40 ~CCrawl(); 41 42 //CCrawl类中最重要的函数 43 void DoCrawl(); 44 45 //根据URL以及套接字文件描述符抓取URL对应的网页 46 void DownloadFile( CTianwangFile *pTianwangFile,CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock); 47 48 //每个线程函数start()都调用这个函数 49 void fetch(void *arg); 50 51 //如果url满足条件加到mmapUrls[待访问的url]容器中 52 void AddUrl(const char *url); 53 54 void GetVisitedUrlMD5();//得到已经访问过的URL对应的MD5值,放入open list[setVisitedUrlMD5]中 55 void GetVisitedPageMD5();//得到已经访问过的web网页体对应的MD5值,放入setVisitedPageMD5中 56 57 void GetIpBlock();//得到阻塞的IP,放入mapIpBlock容器中 58 59 void GetUnreachHostMD5();//得到不可到达的主机号,放入setUnreachHostMD5中 60 void OpenFilesForOutput();//打开所有的输出流 61 62 // save in the process 63 void SaveTianwangRawData(CTianwangFile *pTianwangFile,CUrl *pUrl, CPage *pPage);//将抓取的网页以天网格式存储 64 void SaveLink4SERawData(CLink4SEFile *pLink4SEFile,CUrl *pUrl, CPage *pPage);//将抓取的网页从中提取超链接信息建立网页结构库 65 66 void SaveIsamRawData(CUrl *pUrl, CPage *Page); 67 void SaveVisitedUrl(string 
url);//保存已经访问过的URL 68 void SaveUnreachHost(string host);//保存不可到达的主机号 69 void SaveLink4SE(CPage *Page);//保存为搜索引擎准备的超链接信息 70 bool SaveLink4SE031121(void *arg); 71 void SaveLink4History(CPage *Page)//保存为历史网页存档准备的超链接信息 72 73 // save while the program running 74 void SaveVisitedUrlMD5(string md5);//保存已经访问过的URL对应的MD5值 75 void SaveVisitedPageMD5(string md5);//得到已经访问过的web网页体对应的MD5值 76 77 }; 78 79 #endif /* _CRAWL_H_031104_ */
1 #include "Crawl.h" 2 #include "Url.h" 3 #include "Md5.h" 4 5 #include <list.h> 6 #include <hlink.h> 7 #include <uri.h> 8 9 extern pthread_mutex_t mymutex; 10 extern map<string, string> mapCacheHostLookup; //DNS缓存 11 extern vector<string> vsUnreachHost; 12 extern char **ParseRobot(char *data, char len); 13 14 set<string> setVisitedUrlMD5; //open list[已经访问的URL对应的MD5值] 15 set<string> setVisitedPageMD5; //已经访问过的web网页体对应的MD5值 16 set<string> setUnvisitedUrlMD5; //close list[没有访问过的URL对应的MD5值] 17 18 set<string> setUnreachHostMD5; //不可达到的主机号对应的MD5值的集合 19 20 multimap<string, string, less<string> > replicas; //web网页体对应的MD5值<->web网页体对应的URL 21 22 //定义线程的互斥变量并初始化 23 pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER; //保护mmapUrls资源 24 pthread_mutex_t mutexUnreachHost = PTHREAD_MUTEX_INITIALIZER; //保护setUnreachHostMD5&&m_ofsUnreachHostFile资源 25 pthread_mutex_t mutexUnvisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setUnvisitedUrlMD5资源 26 pthread_mutex_t mutexVisitedUrlMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setVisiteUrlMD5&&m_ofsVisiteUrlMD5File资源 27 pthread_mutex_t mutexVisitedPageMD5 = PTHREAD_MUTEX_INITIALIZER; //保护setVisitePageMD5&&m_ofsVisitePageMD5File资源 28 29 pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER; 30 pthread_mutex_t mutexLink4SEFile = PTHREAD_MUTEX_INITIALIZER; //保护m_ofsLink4SEFile资源 31 pthread_mutex_t mutexLink4HistoryFile = PTHREAD_MUTEX_INITIALIZER; 32 pthread_mutex_t mutexIsamFile = PTHREAD_MUTEX_INITIALIZER; 33 pthread_mutex_t mutexVisitedUrlFile = PTHREAD_MUTEX_INITIALIZER; 34 pthread_mutex_t mutexUnreachHostFile = PTHREAD_MUTEX_INITIALIZER; 35 pthread_mutex_t mutexReplicas = PTHREAD_MUTEX_INITIALIZER; 36 //pthread_mutex_t mutexMemory = PTHREAD_MUTEX_INITIALIZER; 37 38 map<unsigned long, unsigned long> mapIpBlock; //IP阻塞范围 39 bool b_fOver; //线程运行控制参数 40 //multimap<string,string, less<string> > mmapUrls; 41 multimap<string, string> mmapUrls; //保存没有访问过的URL的主机号<->对应的URL 42 43 typedef map<unsigned long, unsigned long>::value_type valTypeIpBlock; 
44 typedef map<string, string>::value_type mvalType; 45 46 void SaveReplicas(const char* filename); //保存镜像网页对应的URL的一个值到指定的文件名中 47 48 struct package { 49 CCrawl *crawl; 50 CPage *page; 51 }; 52 53 vector<string> vsParsedLinks; 54 55 int onfind(const char *elem, const char *attr, struct uri *uri, void *arg) { 56 struct package *p = (struct package*) arg; 57 char buff[URL_LEN + 1]; 58 59 if (uri_recombine(uri, buff, URL_LEN + 1, 60 C_SCHEME | C_AUTHORITY | C_PATH | C_QUERY) >= 0) 61 62 { 63 vsParsedLinks.push_back(buff); 64 if (!p->page->IsFilterLink(buff)) { 65 // accept "a,link,frame,iframe,img,area" 66 67 if (strcasecmp(elem, "img") == 0) { 68 pthread_mutex_lock(&mutexLink4HistoryFile); 69 if (p->crawl->m_ofsLink4HistoryFile) { 70 p->crawl->m_ofsLink4HistoryFile << buff << endl; 71 } 72 pthread_mutex_unlock(&mutexLink4HistoryFile); 73 74 } else { 75 p->crawl->AddUrl(buff); 76 } 77 /* 78 else if (strcasecmp(elem, "img") == 0) 79 { 80 pthread_mutex_lock(&mutexLink4HistoryFile); 81 if( p->crawl->m_ofsLink4HistoryFile ){ 82 p->crawl->m_ofsLink4HistoryFile << p->page->m_sUrl << endl;; 83 } 84 pthread_mutex_unlock(&mutexLink4HistoryFile); 85 } 86 */ 87 } 88 } 89 90 uri_destroy(uri); 91 free(uri); 92 return 1; 93 } 94 95 /*********************************************************************** 96 * Function name: start 97 * Input argv: 98 * -- arg: the CCrawl handle 99 * Output argv: 100 * -- 101 * Return: 102 ***********************************************************************/ 103 //线程函数-->每个线程函数调用fetch(void*arg)函数 104 void* start(void *arg) { 105 ((CCrawl*) arg)->fetch(arg); 106 } 107 108 /* 109 这个函数设计的很巧妙,这里说的巧妙不是函数写有多hi: 110 我们知道spider在最开始抓取网页的时候需要种子url, 111 我们这个spider的种子url文件库是tse_seed.ur文件 112 而这个函数正好在我们强制中断程序的时候,将mmapUrls 113 中没有访问完的url放入tse_unvisited.url文件中, 114 扩充了我们的种子URL库!*/ 115 void SaveUnvisitedUrl() { 116 ofstream ofsUnvisitedUrl; 117 ofsUnvisitedUrl.open(UNVISITED_FILE.c_str(), 118 ios::in | ios::out | ios::trunc | ios::binary); //以二进制可追加写方式打开文件 119 if 
(!ofsUnvisitedUrl) //打开失败 120 { 121 cerr << "cannot open " << UNVISITED_FILE << "for output" << endl; 122 exit(-1); 123 } 124 125 //将mmapUrls中没有访问完的url放入tse_unvisited.url文件中,扩充了我们的URL种子库! 126 multimap<string, string>::iterator it = mmapUrls.begin(); 127 for (; it != mmapUrls.end(); it++) { 128 ofsUnvisitedUrl << ((*it).second).c_str() << "\n"; 129 } 130 131 ofsUnvisitedUrl << endl; 132 ofsUnvisitedUrl.close(); 133 134 } 135 136 /*********************************************************************** 137 * Function name: fetch 138 * Input argv: 139 * -- arg: the CCrawl handle 140 * Output argv: 141 * -- 142 * Return: 143 ***********************************************************************/ 144 void CCrawl::fetch(void *arg) //每个线程都执行这个函数 145 { 146 string strUrl, host; 147 148 int nGSock = -1; //之前的套接字文件描述符 149 string strGHost = ""; //字前的主机号 150 151 // create a Tianwang file for output the raw page data 152 string ofsName = DATA_TIANWANG_FILE + "." + CStrFun::itos(pthread_self()); //Tianwang.raw+"线程号" 153 CTianwangFile tianwangFile(ofsName); //创建一个天网格式的文件,保存为原始网页库 154 155 // create a Link4SE file for output the raw link data 156 ofsName = DATA_LINK4SE_FILE + "." 
+ CStrFun::itos(pthread_self()); //Link4SE.raw+"线程号" 157 CLink4SEFile link4SEFile(ofsName); //创建一个网页结构库 158 159 int iSleepCnt = 0; //线程运行控制参数 160 for (;;) { 161 pthread_mutex_lock(&mutexCollection); //互斥的锁定函数 162 //if( !mmapUrls.empty() ){ 163 int cnt = mmapUrls.size(); 164 if (cnt > 0) { 165 //已经收集的没有访问的url 166 cout << "collection has: " << cnt << " unvisited urls" << endl; 167 multimap<string, string>::iterator it = mmapUrls.begin(); 168 if (it != mmapUrls.end()) { 169 // get an URL 170 strUrl = (*it).second; //从待访问的URL队列中得到一个URL进行访问 171 172 // remove it from the collection 173 mmapUrls.erase(it); //删除迭代器所指的元素 174 175 pthread_mutex_unlock(&mutexCollection); //互斥的解锁函数 176 177 // parse URL 178 CUrl iUrl; //关键是看看strUrl是否有http://协议号,没有返回false 179 if (iUrl.ParseUrlEx(strUrl) == false) { 180 cout << "ParseUrlEx error in fetch(): " << strUrl << endl; 181 continue; 182 } 183 184 //表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同 185 //故,我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的 186 //套接字文件描述符进行通信,这是由于循环导致的 187 if (strGHost != iUrl.m_sHost) { 188 close(nGSock); 189 nGSock = -1; 190 strGHost = iUrl.m_sHost; 191 } 192 193 //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库 194 ((CCrawl*) arg)->DownloadFile(&tianwangFile, &link4SEFile, iUrl, 195 nGSock); 196 197 cnt = 0; 198 } else { 199 pthread_mutex_unlock(&mutexCollection); 200 } 201 } else { 202 //待访问的URL队列mmapUrls中没有URL了,这个时候我们必须挂起线程进行等待 203 pthread_mutex_unlock(&mutexCollection); 204 usleep(1000); 205 iSleepCnt++; 206 } 207 208 if (b_fOver == true && iSleepCnt == 200) //当URL队列mmapUrls有200次都是空的时候就结束这个线程调用的fetch()函数 209 break; 210 /* 211 if( b_fOver == true ){ 212 break; 213 } else if( cnt == 100 ) { 214 cout << "w."; 215 cnt = 0; 216 } 217 */ 218 } 219 220 tianwangFile.Close(); 221 link4SEFile.Close(); 222 } 223 224 /*********************************************************************** 225 * Function name: DownloadFile 226 * Input argv: 227 * -- pTianwang: the CCrawl handle 228 * -- pLink4SE: the CCrawl handle 229 * -- iUrl: the URL for crawling 230 * -- 
 * nGSock: the previous global socket
 * Output argv:
 * --
 * Return:
 ***********************************************************************/

// Fetch the page addressed by iUrl (following up to 3 redirects), free all
// buffers on every error path, de-duplicate by URL MD5 and page-body MD5,
// store the page in Tianwang format, then parse its hyperlinks and feed new
// URLs back into the crawl. nGSock caches the socket of the previous request
// so consecutive requests to the same host reuse the connection.
void CCrawl::DownloadFile(CTianwangFile *pTianwangFile,
		CLink4SEFile *pLink4SEFile, CUrl iUrl, int& nGSock) {
	char *downloaded_file = NULL,	// page body (allocated by http.Fetch)
			*fileHead = NULL,	// response header
			*location = NULL;	// redirect target, if any
	int file_length = 0;		// body length in bytes, or a negative error code
	string strUrlLocation = "";	// absolute URL of the redirect target

	// reuse the previous socket when the new request targets the same host:
	// saves bandwidth and connection setup time
	int nSock = nGSock;

	cout << "1. pid=" << pthread_self() << " sock = " << nGSock << endl;

	CHttp http;

	// the actual page download
	file_length = http.Fetch(iUrl.m_sUrl, &downloaded_file, &fileHead,
			&location, &nSock);

	int nCount = 0;	// redirect counter; give up after 3 redirects

	while (file_length == -300)	// -300: the URL was redirected
	{	// moved to an another place
		// give up on over-long, empty, or too-deeply-nested redirects
		if (strlen(location) > URL_LEN - 1 || nCount == 3
				|| strlen(location) == 0) {
			if (location) {
				//pthread_mutex_lock(&mutexMemory);
				free(location);
				location = NULL;
				//pthread_mutex_unlock(&mutexMemory);
			}
			file_length = -1;
			break;
		}

		// keep the redirect target for the next fetch attempt
		strUrlLocation = location;
		if (location) {
			//pthread_mutex_lock(&mutexMemory);
			free(location);
			location = NULL;
			//pthread_mutex_unlock(&mutexMemory);
		}

		// the redirect target may be a relative path, so convert it to an
		// absolute URL (same logic as hyperlink extraction in CPage)
		string::size_type idx1 = CStrFun::FindCase(strUrlLocation, "http");

		if (idx1 != 0) {	// no leading "http://" scheme: relative URL

			char c1 = iUrl.m_sUrl.at(iUrl.m_sUrl.length() - 1);
			char c2 = strUrlLocation.at(0);

			if (c2 == '/')	// host-relative path
			{
				strUrlLocation = "http://" + iUrl.m_sHost + strUrlLocation;
			} else if (c1 != '/' && c2 != '/') {
				string::size_type idx;

				// splice the target onto the original URL's directory part
				idx = iUrl.m_sUrl.rfind('/');
				if (idx != string::npos) {
					if (idx > 6) {	// > strlen("http://..")
						strUrlLocation = iUrl.m_sUrl.substr(0, idx + 1)
								+ strUrlLocation;
					} else {
						strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
					}

				} else {
					file_length = -1;
					break;
				}
			} else {
				if (c1 == '/') {
					strUrlLocation = iUrl.m_sUrl + strUrlLocation;
				} else {
					strUrlLocation = iUrl.m_sUrl + "/" + strUrlLocation;
				}
			}
		}

		CPage iPage;
		if (iPage.IsFilterLink(strUrlLocation)) {	// filtered target: stop here
			file_length = -1;
			break;
		}

		cout << "2. pid=" << pthread_self() << " sock = " << nGSock << endl;
		file_length = http.Fetch(strUrlLocation, &downloaded_file, &fileHead,
				&location, &nSock);
		nCount++;
	}

	// hand the (possibly new) socket back to the caller for reuse
	nGSock = nSock;

	if (file_length == -1) {	// generic fetch failure (see http.Fetch)
		cout << "!-: " << iUrl.m_sUrl << endl;
		//pthread_mutex_lock(&mutexMemory);
		if (fileHead) {
			free(fileHead);
			fileHead = NULL;
		}
		if (downloaded_file) {
			free(downloaded_file);
			downloaded_file = NULL;
		}
		//pthread_mutex_unlock(&mutexMemory);

		cout << "-unreach host: " << iUrl.m_sHost << endl;
		;
		return;
	}

	if (file_length == -2) {	// out of ip block: host inside a blocked IP range
		//pthread_mutex_lock(&mutexMemory);
		if (fileHead) {
			free(fileHead);
			fileHead = NULL;
		}
		if (downloaded_file) {
			free(downloaded_file);
			downloaded_file = NULL;
		}
		//pthread_mutex_unlock(&mutexMemory);

		// save unreach host
		SaveUnreachHost(iUrl.m_sHost);

		cout << "-out of block host: " << iUrl.m_sHost << endl;
		;
		return;
	}

	if (file_length == -3) {	// invalid host or ip
		//pthread_mutex_lock(&mutexMemory);
		if (fileHead) {
			free(fileHead);
			fileHead = NULL;
		}
		if (downloaded_file) {
			free(downloaded_file);
			downloaded_file = NULL;
		}
		//pthread_mutex_unlock(&mutexMemory);
		cout << "-invalid host: " << iUrl.m_sHost << endl;
		return;
	}

	if (file_length == -4) {	// MIME is image/xxx: record for history, do not crawl
		//pthread_mutex_lock(&mutexMemory);
		if (fileHead) {
			free(fileHead);
			fileHead = NULL;
		}
		if (downloaded_file) {
			free(downloaded_file);
			downloaded_file = NULL;
		}
		//pthread_mutex_unlock(&mutexMemory);

		if (m_ofsLink4HistoryFile) {	// link kept for page-history archiving
			pthread_mutex_lock(&mutexLink4HistoryFile);
			m_ofsLink4HistoryFile << iUrl.m_sUrl << endl;
			;	// URL saved into link4History.url
			pthread_mutex_unlock(&mutexLink4HistoryFile);
		}

		cout << "-imgage host: " << iUrl.m_sHost << endl;
		return;
	}

	/* still experiment
	 char **dir;
	 dir = ParseRobot( downloaded_file, file_length);
	 for( int i = 0; dir[i] != NULL ; i++){
	 cout << dir[i] << endl;
	 free( dir[i] );
	 }

	 exit(1);
	 */

	// so small, maybe some unuseful info, skipped
	//if(file_length < 40){ // for ImgSE,
	/*
	 if(file_length < 256){ // for SE
	 //pthread_mutex_lock(&mutexMemory);
	 if (fileHead)
	 {
	 free(fileHead); fileHead=NULL;
	 }
	 if (downloaded_file)
	 {
	 free(downloaded_file); downloaded_file=NULL;
	 }
	 //pthread_mutex_unlock(&mutexMemory);
	 cout << "#";
	 return;
	 }
	 */

	// deal with normal page

	// a page missing either its header or its body is not usable
	if (!fileHead || !downloaded_file)
	{
		//pthread_mutex_lock(&mutexMemory);
		if (fileHead) {
			free(fileHead);
			fileHead = NULL;
		}
		if (downloaded_file) {
			free(downloaded_file);
			downloaded_file = NULL;
		}
		//pthread_mutex_unlock(&mutexMemory);
		close(nGSock);
		nGSock = -1;
		cout << "-size0 host: " << iUrl.m_sHost << endl;
		return;
	}

	// wrap the whole downloaded page in a CPage, then release the C buffers
	CPage iPage(iUrl.m_sUrl, strUrlLocation, fileHead, downloaded_file,
			file_length);
	//pthread_mutex_lock(&mutexMemory);
	if (fileHead) {
		free(fileHead);
		fileHead = NULL;
	}
	if (downloaded_file) {
		free(downloaded_file);
		downloaded_file = NULL;
	}
	//pthread_mutex_unlock(&mutexMemory);

	// parse the response header
	iPage.ParseHeaderInfo(iPage.m_sHeader);

	// server did not ask to keep the connection alive
	if (iPage.m_bConnectionState == false) {
		close(nGSock);
		nGSock = -1;
	}

	// when crawling images for ImgSE, remember to comment the paragraph
	// when crawling plain text for SE, remember to open the paragraph
	// paragraph begin

	// iPage.m_sContentType != "text/css" &&

	// drop content types we do not index
	if (iPage.m_sContentType != "text/html"
			&& iPage.m_sContentType != "text/plain"
			&& iPage.m_sContentType != "text/xml"
			&& iPage.m_sContentType != "application/msword"
			&& iPage.m_sContentType != "application/pdf"
			&& iPage.m_sContentType != "text/rtf"
			&& iPage.m_sContentType != "application/postscript"
			&& iPage.m_sContentType != "application/vnd.ms-execl"
			&& iPage.m_sContentType != "application/vnd.ms-powerpoint") {

		cout << "-unwant type host: " << iUrl.m_sHost << endl;
		return;
	}

	// paragraph end

	// gzip-encoded bodies are decompressed via a temporary <tid>.gz file;
	// NOTE(review): the decompressed buffer is currently NOT copied back
	// into iPage (the assignment below is commented out) -- confirm intended.
	char sUnzipContent[1024000];	// 1000K < 1M
	int nUnzipLength = 0;
	if (iPage.m_sContentEncoding == "gzip"
			&& iPage.m_sContentType == "text/html") {

		gzFile zip;
		// scratch file holding the still-compressed body
		string ofsGzipName;

		ofsGzipName = CStrFun::itos(pthread_self()) + ".gz";

		// ios::trunc: truncate to length 0 if the file exists, create otherwise
		ofstream ofsDownloadFile(ofsGzipName.c_str(), ios::trunc | ios::binary);

		cout << "file_length: " << file_length << endl;
		ofsDownloadFile.write(iPage.m_sContent.c_str(), iPage.m_nLenContent);
		ofsDownloadFile.close();

		zip = gzopen(ofsGzipName.c_str(), "rb");
		if (zip == NULL) {
			cout << "Open zip file " << ofsGzipName.c_str() << " error."
					<< endl;
			exit(-1);
		}

		// inflate into sUnzipContent
		nUnzipLength = gzread(zip, sUnzipContent, 1024000);
		if (nUnzipLength == -1) {
			cout << "Read zip file " << ofsGzipName.c_str() << " error."
					<< endl;
			exit(-1);
		}

		sUnzipContent[nUnzipLength] = 0;

		gzclose(zip);

		// replace the compressed body with the decompressed one:
		//iPage.m_sContent.assign(sUnzipContent,nUnzipLength);
		//iPage.m_nLenContent=nUnzipLength;
	}
	// decompression done

	CMD5 iMD5;
	string strDigest;

	/////////////////////////////
	// because we can make sure the url in the setVisitedUrlMd5
	// is not same(we have check it before insert it to the collection),
	// we intert it directly. however...
	//iMD5.GenerateMD5( (unsigned char*)iPage.m_sUrl.c_str(), iPage.m_sUrl.length() );

	// URL-level dedup: bail out if this URL's MD5 is already in the open
	// list; otherwise record it as visited (set + file, under one lock)
	iMD5.GenerateMD5((unsigned char*) iUrl.m_sUrl.c_str(),iUrl.m_sUrl.length());
	strDigest = iMD5.ToString();

	pthread_mutex_lock(&mutexVisitedUrlMD5);
	if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end())
	{
		cout << "!vurl: ";	// 1.crawled already
		pthread_mutex_unlock(&mutexVisitedUrlMD5);
		return;
	}

	setVisitedUrlMD5.insert(strDigest);
	SaveVisitedUrlMD5(strDigest);
	pthread_mutex_unlock(&mutexVisitedUrlMD5);

	/////////////////////////////
	// whether it is a visited page
	// for ImgSE, should comment this paragraph
	// for SE, should uncomment this paragraph

	// begin

	// body-level dedup: a page body seen before means this URL is a mirror
	iMD5.GenerateMD5((unsigned char*) iPage.m_sContent.c_str(),iPage.m_sContent.length());
	strDigest = iMD5.ToString();
	pthread_mutex_lock(&mutexVisitedPageMD5);
	// record body-MD5 <-> URL so SaveReplicas() can report mirror groups
	replicas.insert(pair<string, string>(strDigest, iPage.m_sUrl));
	if (setVisitedPageMD5.find(strDigest) != setVisitedPageMD5.end())
	{
		cout << "!vpage: ";	// crawled already
		pthread_mutex_unlock(&mutexVisitedPageMD5);
		return;
	}
	setVisitedPageMD5.insert(strDigest);

	SaveVisitedPageMD5(strDigest);
	pthread_mutex_unlock(&mutexVisitedPageMD5);

	// end

	cout << "+";

	////////////////////
	// save as Tianwang format
	SaveTianwangRawData(pTianwangFile, &iUrl, &iPage);

	////////////////////
	// save visited Urls (prefer the final location after redirects)
	if (iPage.m_sLocation.length() < 1) {
		SaveVisitedUrl(iUrl.m_sUrl);
	} else {
		SaveVisitedUrl(iPage.m_sLocation);
	}

	//return; // just crawl seeds

	/////////////////////////////////////
	// Parse hyperlinks
	if (iPage.m_sContentType != "text/html") {	// we can only find links in text/html
		return;
	}

	/*
	 if (iPage.ParseHyperLinks() == false){
	 return;
	 }

	 SaveLink4SE( &iPage);
	 SaveLink4History( &iPage);

	 map<string,string>::iterator it4SE = iPage.m_mapLink4SE.begin();
	 string str;
	 for( ; it4SE!= iPage.m_mapLink4SE.end(); ++it4SE ){
	 str = (*it4SE).first;
	 AddUrl( str.c_str() );

	 }
	 */
	// using XIE Han's link parser

	struct uri page_uri;
	//FILE *tmp;

	//tmp = tmpfile();

	//fwrite(iPage.m_sContent.c_str(), iPage.m_nLenContent, 1, tmp);
	//fseek(tmp, 0, SEEK_SET);
	//fclose(tmp);

	// mutexDetect serializes vsParsedLinks (filled by onfind) and the
	// SaveLink4SE031121 call that consumes it
	pthread_mutex_lock(&mutexDetect);

	if (iPage.m_sLocation.empty()) {
		uri_parse_string(iPage.m_sUrl.c_str(), &page_uri);
	} else {
		uri_parse_string(iPage.m_sLocation.c_str(), &page_uri);
	}

	struct package p = { this, &iPage };
	//hlink_detect(tmp, &page_uri, onfind, &p);

	hlink_detect_string(iPage.m_sContent.c_str(), &page_uri, onfind, &p);

	struct file_arg pLinks = { &iUrl, &iPage };

	SaveLink4SE031121(&pLinks);

	// save as Link4SE format
	//SaveLink4SERawData(pLink4SEFile, &iUrl, &iPage);

	pthread_mutex_unlock(&mutexDetect);

	uri_destroy(&page_uri);
	cout << "Parse End......" << endl;

	return;
}

// Write groups of URLs that share the same page-body MD5 (mirror pages)
// to `filename`; only groups with at least two URLs are emitted.
// NOTE(review): the final group is never flushed and the last `oss` is
// leaked on exit -- confirm before relying on complete output.
void SaveReplicas(const char* filename) {
	//ofstream ofs(filename, ios::out|ios::app);
	ofstream ofs(filename, ios::out | ios::binary | ios::app);
	if (!ofs) {
		cout << "error open file " << endl;
	}
	string md5;	// MD5 of the group currently being accumulated

	pthread_mutex_lock(&mutexReplicas);
	multimap<string, string, less<string> >::const_iterator it;
	ostringstream *oss = 0;
	int i = 0;	// URL count in the current group
	for (it = replicas.begin(); it != replicas.end(); it++) {
		if (!md5.empty() && md5 != it->first) {
			// group boundary: flush the previous group if it was a mirror
			if (i >= 2)
				ofs << (*oss).str() << endl;
			//pthread_mutex_lock(&mutexMemory);
			delete (oss);
			oss = new ostringstream;
			//pthread_mutex_unlock(&mutexMemory);
			(*oss) << it->first << endl;
			i = 0;
			md5 = it->first;
		} else if (md5.empty()) {
			// very first group
			md5 = it->first;
			//pthread_mutex_lock(&mutexMemory);
			oss = new ostringstream;
			//pthread_mutex_unlock(&mutexMemory);
			(*oss) << it->first << endl;
			i = 0;
		}
		if (oss != 0)
			(*oss) << it->second << endl;
		i++;
	}

	pthread_mutex_unlock(&mutexReplicas);
}

////////////////////////////////////////////////////////////////////////////
// Construction/Destruction
////////////////////////////////////////////////////////////////////////////

CCrawl::CCrawl() {
}

CCrawl::CCrawl(string inputFileName, string outputFileName) {
	m_sInputFileName = inputFileName;
	m_sOutputFileName = outputFileName;	// + ".txt"
}

// Close every output stream opened by OpenFilesForOutput().
// NOTE(review): m_ofsUnreachHostFile and m_ofsUnreachUrlFile are not closed
// explicitly here; ofstream's destructor closes them, so this is harmless
// but inconsistent.
CCrawl::~CCrawl() {
	m_ofsVisitedUrlFile.close();
	m_ofsLink4SEFile.close();
	m_ofsLink4HistoryFile.close();
	m_isamFile.Close();
	m_ofsVisitedUrlMD5File.close();
	m_ofsVisitedPageMD5File.close();
}

/*****************************************************************
 ** Function name: SigTerm
 ** Input argv:
 ** --
 ** Output argv:
 ** --
 ** Return:
 ** Function Description: signal function
 ** Version: 1.0
 ** Be careful:
 *****************************************************************/
// SIGTERM handler: persist the unvisited queue (future seeds) and the
// mirror-page report, then exit.
static void SigTerm(int x) {
	SaveUnvisitedUrl();
	SaveReplicas("repli");

	cout << "Terminated!" << endl;
	exit(0);
}

// Load the MD5 of every previously visited URL from URL_MD5_FILE into the
// open list setVisitedUrlMD5 (one MD5 per line). Missing file is not an error.
void CCrawl::GetVisitedUrlMD5() {
	ifstream ifsMD5(URL_MD5_FILE.c_str(), ios::binary);
	if (!ifsMD5) {
		//cerr << "did not find " << UrlMD5_FILE << " for iutput" << endl;
		return;
	}

	string strMD5;
	while (getline(ifsMD5, strMD5)) {
		setVisitedUrlMD5.insert(strMD5);
	}

	ifsMD5.close();
	cout << "got " << setVisitedUrlMD5.size() << " md5 values of visited urls"
			<< endl;
}

// Load the MD5 of every previously visited page body from PAGE_MD5_FILE into
// setVisitedPageMD5 (one MD5 per line). Missing file is not an error.
void CCrawl::GetVisitedPageMD5() {
	ifstream ifsMD5(PAGE_MD5_FILE.c_str(), ios::binary);
	if (!ifsMD5) {
		//cerr << "did not find " << PageMD5_FILE << " for iutput" << endl;
		return;
	}

	string strMD5;
	while (getline(ifsMD5, strMD5)) {
		setVisitedPageMD5.insert(strMD5);
	}

	ifsMD5.close();
	cout << "got " << setVisitedPageMD5.size() << " md5 values of visited pages"
			<< endl;
}

// Load blocked IP ranges from IP_BLOCK_FILE into mapIpBlock. Each non-empty,
// non-comment line holds "<startIp> <endIp>"; both are stored via inet_addr.
void CCrawl::GetIpBlock() {
	ifstream ifsIpBlock(IP_BLOCK_FILE.c_str());
	if (!ifsIpBlock) {
		//cerr << "Cannot open " << IP_BLOCK_FILE << " for input." << endl;
		return;
	}
	string strIpBlock;
	while (getline(ifsIpBlock, strIpBlock)) {
		// skip blank and comment lines
		if (strIpBlock[0] == '\0' || strIpBlock[0] == '#'
				|| strIpBlock[0] == '\n') {

			continue;
		}

		char buf1[64], buf2[64];

		buf1[0] = '\0';
		buf2[0] = '\0';
		sscanf(strIpBlock.c_str(), "%s %s", buf1, buf2);

		mapIpBlock.insert(valTypeIpBlock(inet_addr(buf1), inet_addr(buf2)));
	}
	ifsIpBlock.close();

}

// Load unreachable host names from UNREACH_HOST_FILE, lower-case each one,
// and store its MD5 in setUnreachHostMD5. Missing file is only a warning.
void CCrawl::GetUnreachHostMD5() {
	//vsUnreachHost.reserve(MAX_UNREACHABLE_HOST_NUM);
	ifstream ifsUnreachHost(UNREACH_HOST_FILE.c_str());
	if (!ifsUnreachHost) {
		cerr << "Cannot open " << UNREACH_HOST_FILE << " for input." << endl;
		return;
	}

	string strUnreachHost;
	//int i=0;
	while (getline(ifsUnreachHost, strUnreachHost)) {
		// skip blank and comment lines
		if (strUnreachHost[0] == '\0' || strUnreachHost[0] == '#'
				|| strUnreachHost[0] == '\n') {

			continue;
		}

		CStrFun::Str2Lower(strUnreachHost, strUnreachHost.size());
		//vsUnreachHost.push_back(strUnreachHost);
		CMD5 iMD5;
		iMD5.GenerateMD5((unsigned char*) strUnreachHost.c_str(),
				strUnreachHost.size());
		string strDigest = iMD5.ToString();
		setUnreachHostMD5.insert(strDigest);
		//i++;
		//if(i == MAX_UNREACHABLE_HOST_NUM) break;
	}

	ifsUnreachHost.close();

}

/**************************************************************************************
 * Function name: SaveTianwangRawData
 * Input argv:
 *	-- pTianwangFile: tianwang file handle
 *	-- pUrl: url
 *	-- pPage: web page
 * Output argv:
 *	--
 * Return:
 * Function Description: save raw page data as tianwang file
 **************************************************************************************/
void CCrawl::SaveTianwangRawData(CTianwangFile *pTianwangFile, CUrl *pUrl,CPage *pPage) {
	if (!pTianwangFile || !pUrl || !pPage) {
		return;
	}

	file_arg arg;
	arg.pUrl = pUrl;
	arg.pPage = pPage;

	// each thread writes to its own file, so no mutex is needed here
	pTianwangFile->Write((void*) &arg);
}

/**************************************************************************************
 * Function name: SaveLink4SERawData
 * Input argv:
 *	-- pLink4SEFile: link4SE file handle
 *	-- pUrl: url
 *	-- pPage: web page
 * Output argv:
 *	--
 * Return:
 * Function Description: save raw link data as link4SE file
 **************************************************************************************/
void CCrawl::SaveLink4SERawData(CLink4SEFile *pLink4SEFile, CUrl *pUrl,
		CPage *pPage) {
	if (!pLink4SEFile || !pUrl || !pPage) {
		return;
	}

	file_arg arg;
	arg.pUrl = pUrl;
	arg.pPage = pPage;

	// each thread writes to its own file, so no mutex is needed here
	pLink4SEFile->Write((void*) &arg);
}

/**************************************************************************************
 * Function name: SaveIsamRawData
 * Input argv:
 *	-- pUrl: url
 *	-- pPage: web page
 * Output argv:
 *	--
 * Return:
 * Function Description: save raw page data as ISAM file
 **************************************************************************************/
void CCrawl::SaveIsamRawData(CUrl *pUrl, CPage *pPage) {
	if (!pUrl || !pPage) {
		return;
	}

	file_arg arg;
	arg.pUrl = pUrl;
	arg.pPage = pPage;

	// the ISAM file is shared across threads, so serialize writes
	pthread_mutex_lock(&mutexIsamFile);

	m_isamFile.Write((void *) &arg);

	pthread_mutex_unlock(&mutexIsamFile);
}

/**************************************************************************************
 * Function name: SaveVisitedUrl
 * Input argv:
 *	-- url: url
 * Output argv:
 *	--
 * Return:
 * Function Description: save raw the Visited Url
**************************************************************************************/ 937 void CCrawl::SaveVisitedUrl(string url) { 938 if (m_ofsVisitedUrlFile) { 939 pthread_mutex_lock(&mutexVisitedUrlFile); 940 941 m_ofsVisitedUrlFile << url << endl; 942 943 pthread_mutex_unlock(&mutexVisitedUrlFile); 944 } 945 } 946 947 void CCrawl::SaveUnreachHost(string host) { 948 CMD5 iMD5; 949 iMD5.GenerateMD5((unsigned char*) host.c_str(), host.size()); 950 string strDigest = iMD5.ToString(); 951 if (setUnreachHostMD5.find(strDigest) == setUnreachHostMD5.end()) { 952 pthread_mutex_lock(&mutexUnreachHost); 953 954 setUnreachHostMD5.insert(strDigest); 955 if (m_ofsUnreachHostFile) { 956 m_ofsUnreachHostFile << host << endl; 957 } 958 959 pthread_mutex_unlock(&mutexUnreachHost); 960 } 961 } 962 963 void CCrawl::SaveLink4SE(CPage *iPage) { 964 if (m_ofsLink4SEFile && iPage->m_nRefLink4SENum > 0) { 965 pthread_mutex_lock(&mutexLink4SEFile); 966 967 m_ofsLink4SEFile << "root_url: " << iPage->m_sUrl << endl; 968 m_ofsLink4SEFile << "charset: " << iPage->m_sCharset << endl; 969 m_ofsLink4SEFile << "number: " << iPage->m_nRefLink4SENum << endl; 970 m_ofsLink4SEFile << "link_anchortext: " << endl; 971 972 map<string, string>::iterator it4SE = iPage->m_mapLink4SE.begin(); 973 for (; it4SE != iPage->m_mapLink4SE.end(); ++it4SE) { 974 975 m_ofsLink4SEFile << (*it4SE).first << '\t' << (*it4SE).second 976 << endl; 977 ; 978 979 } 980 981 pthread_mutex_unlock(&mutexLink4SEFile); 982 } 983 } 984 985 bool CCrawl::SaveLink4SE031121(void *arg) { 986 if (!arg || !m_ofsLink4SEFile) 987 return false; 988 989 //pthread_mutex_lock(&mutexLink4SEFile); 990 991 if (vsParsedLinks.size() == 0) 992 return false; 993 994 file_arg *pFile = (file_arg *) arg; 995 996 CUrl *iUrl = pFile->pUrl; 997 CPage *iPage = pFile->pPage; 998 999 char strDownloadTime[128]; 1000 time_t tDate; 1001 1002 memset(strDownloadTime, 0, 128); 1003 time(&tDate); 1004 strftime(strDownloadTime, 128, "%a, %d %b %Y %H:%M:%S GMT", 
gmtime(&tDate)); 1005 1006 string links; 1007 vector<string>::iterator it = vsParsedLinks.begin(); 1008 for (; it != vsParsedLinks.end(); ++it) { 1009 links = links + *it + "\n"; 1010 } 1011 1012 m_ofsLink4SEFile << "version: 1.0\n"; 1013 if (iPage->m_sLocation.size() == 0) { 1014 m_ofsLink4SEFile << "url: " << iPage->m_sUrl; 1015 } else { 1016 m_ofsLink4SEFile << "url: " << iPage->m_sLocation; 1017 m_ofsLink4SEFile << "\norigin: " << iUrl->m_sUrl; 1018 } 1019 1020 m_ofsLink4SEFile << "\ndate: " << strDownloadTime; 1021 1022 if (mapCacheHostLookup.find(iUrl->m_sHost) == mapCacheHostLookup.end()) { 1023 m_ofsLink4SEFile << "\nip: " << iUrl->m_sHost; 1024 } else { 1025 m_ofsLink4SEFile << "\nip: " 1026 << (*(mapCacheHostLookup.find(iUrl->m_sHost))).second; 1027 } 1028 1029 m_ofsLink4SEFile << "\noutdegree: " << vsParsedLinks.size(); 1030 m_ofsLink4SEFile << "\nlength: " << iPage->m_nLenHeader + links.size() + 1 1031 << "\n\n" << iPage->m_sHeader << "\n"; 1032 m_ofsLink4SEFile << links; 1033 m_ofsLink4SEFile << endl; 1034 1035 vsParsedLinks.clear(); 1036 //pthread_mutex_unlock(&mutexLink4SEFile); 1037 1038 return true; 1039 } 1040 1041 // not well 1042 void CCrawl::SaveLink4History(CPage *iPage) {//保存为历史网页存档准备的超链接信息 1043 if (m_ofsLink4HistoryFile && iPage->m_nRefLink4HistoryNum > 0) { 1044 pthread_mutex_lock(&mutexLink4HistoryFile); 1045 1046 //m_ofsLink4HistoryFile << "root_url: " << iPage->m_sUrl << endl; 1047 //m_ofsLink4HistoryFile << "charset: " << iPage->m_sCharset << endl; 1048 //m_ofsLink4HistoryFile << "number: " << iPage->m_nRefLink4HistoryNum << endl; 1049 //m_ofsLink4HistoryFile << "link: " << endl; 1050 1051 vector<string>::iterator it4History = iPage->m_vecLink4History.begin(); 1052 for (; it4History != iPage->m_vecLink4History.end(); ++it4History) { 1053 string s = *it4History; 1054 m_ofsLink4HistoryFile << s << endl; 1055 } 1056 1057 pthread_mutex_unlock(&mutexLink4HistoryFile); 1058 } 1059 } 1060 1061 
/************************************************************************************** 1062 * Function name: SaveVisitedUrlMd5 1063 * Input argv: 1064 * -- md5: page md5 value 1065 * Output argv: 1066 * -- 1067 * Return: 1068 * Function Description: save the visited url Md5 1069 **************************************************************************************/ 1070 void CCrawl::SaveVisitedUrlMD5(string md5) { 1071 if (m_ofsVisitedUrlMD5File) { 1072 m_ofsVisitedUrlMD5File << md5 << endl; 1073 } 1074 } 1075 1076 /************************************************************************************** 1077 * Function name: SaveVisitedPageMd5 1078 * Input argv: 1079 * -- md5: page md5 value 1080 * Output argv: 1081 * -- 1082 * Return: 1083 * Function Description: save the visited url Md5 1084 **************************************************************************************/ 1085 void CCrawl::SaveVisitedPageMD5(string md5) { 1086 if (m_ofsVisitedPageMD5File) { 1087 m_ofsVisitedPageMD5File << md5 << endl; 1088 } 1089 } 1090 1091 /************************************************************************************** 1092 * Function name: OpenFileForOutput 1093 * Input argv: 1094 * -- 1095 * Output argv: 1096 * -- 1097 * Return: 1098 * Function Description: Open the files for output 1099 **************************************************************************************/ 1100 void CCrawl::OpenFilesForOutput() { 1101 // open isam file for output 1102 m_isamFile.Open(DATA_FILE_NAME, INDEX_FILE_NAME); 1103 1104 // open visited.url file for output 1105 m_ofsVisitedUrlFile.open(m_sOutputFileName.c_str(), 1106 ios::out | ios::app | ios::binary); 1107 if (!m_ofsVisitedUrlFile) { 1108 cerr << "cannot open " << VISITED_FILE << " for output\n" << endl; 1109 } 1110 1111 // open link4SE.url file for output 1112 m_ofsLink4SEFile.open(LINK4SE_FILE.c_str(), 1113 ios::out | ios::app | ios::binary); 1114 if (!m_ofsLink4SEFile) { 1115 cerr << "cannot open " << LINK4SE_FILE << " 
for output\n" << endl; 1116 } 1117 1118 // open link4History.url file for output 1119 m_ofsLink4HistoryFile.open(LINK4History_FILE.c_str(), 1120 ios::out | ios::app | ios::binary); 1121 if (!m_ofsLink4HistoryFile) { 1122 cerr << "cannot open " << LINK4History_FILE << " for output\n" << endl; 1123 } 1124 1125 // open unreach host file for output 1126 m_ofsUnreachHostFile.open(UNREACH_HOST_FILE.c_str(), 1127 ios::out | ios::app | ios::binary); 1128 if (!m_ofsUnreachHostFile) { 1129 cerr << "cannot open " << UNREACH_HOST_FILE << " for output\n" << endl; 1130 } 1131 1132 // open visited url md5 file for output 1133 m_ofsVisitedUrlMD5File.open(URL_MD5_FILE.c_str(), 1134 ios::out | ios::app | ios::binary); 1135 if (!m_ofsVisitedUrlMD5File) { 1136 cerr << "cannot open " << URL_MD5_FILE << " for output\n" << endl; 1137 } 1138 1139 // open visited page md5 file for output 1140 m_ofsVisitedPageMD5File.open(PAGE_MD5_FILE.c_str(), 1141 ios::out | ios::app | ios::binary); 1142 if (!m_ofsVisitedPageMD5File) { 1143 cerr << "cannot open " << PAGE_MD5_FILE << " for output\n" << endl; 1144 } 1145 } 1146 1147 /*************************************************************************************** 1148 * Function name: DoCrawl 1149 * Input argv: 1150 * -- 1151 * Output argv: 1152 * -- 1153 * Return: 1154 * Function Description: the main function for crawl 1155 * Be careful: 1156 ***************************************************************************************/ 1157 void CCrawl::DoCrawl() {//CCrawl类中的总控函数 1158 /* set the signal function */ 1159 signal(SIGTERM, SigTerm); 1160 signal(SIGKILL, SigTerm); 1161 signal(SIGINT, SigTerm); 1162 signal(SIGPIPE, SIG_IGN); 1163 signal(SIGCHLD, SIG_IGN); 1164 1165 // output the begin time 1166 char strTime[128]; 1167 time_t tDate; 1168 1169 memset(strTime, 0, 128); 1170 time(&tDate); 1171 strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); 1172 cout << "\n\nBegin at: " << strTime << "\n\n"; 1173 1174 // get the other info from 
file 1175 GetVisitedUrlMD5(); 1176 GetVisitedPageMD5(); 1177 1178 GetIpBlock(); 1179 1180 GetUnreachHostMD5(); 1181 1182 // open the seed url file 1183 ifstream ifsSeed(m_sInputFileName.c_str()); 1184 if (!ifsSeed) { 1185 cerr << "Cannot open " << m_sInputFileName << " for input\n"; 1186 return; 1187 } 1188 1189 // open the files for output 1190 OpenFilesForOutput(); 1191 1192 // Create thread ID structures. 1193 pthread_t *tids = (pthread_t*) malloc(NUM_WORKERS * sizeof(pthread_t)); 1194 if (tids == NULL) { 1195 cerr << "malloc error" << endl; 1196 } 1197 1198 for (unsigned int i = 0; i < NUM_WORKERS; i++) { 1199 if (pthread_create(&tids[i], NULL, start, this)) 1200 cerr << "create threads error" << endl; 1201 } 1202 1203 string strUrl; 1204 CPage iCPage; 1205 while (getline(ifsSeed, strUrl)) { 1206 string::size_type idx; 1207 1208 if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') { 1209 continue; 1210 } 1211 1212 idx = strUrl.find('\t'); 1213 if (idx != string::npos) { 1214 strUrl = strUrl.substr(0, idx); 1215 } 1216 1217 //idx = strUrl.find("http"); 1218 idx = CStrFun::FindCase(strUrl, "http"); 1219 if (idx == string::npos) { 1220 //continue; 1221 idx = strUrl.find('/'); 1222 if (idx == string::npos) { 1223 strUrl = "http://" + strUrl + "/"; 1224 } else { 1225 strUrl = "http://" + strUrl; 1226 } 1227 } 1228 1229 //if( strUrl.length() < 8 ) continue; 1230 1231 if (iCPage.IsFilterLink(strUrl)) 1232 continue; 1233 AddUrl(strUrl.c_str()); 1234 } 1235 1236 // Get the unvisited URL 1237 ifstream ifsUnvisitedUrl(UNVISITED_FILE.c_str()); 1238 if (ifsUnvisitedUrl) { 1239 while (getline(ifsUnvisitedUrl, strUrl)) { 1240 string::size_type idx; 1241 1242 if (strUrl[0] == '\0' || strUrl[0] == '#' || strUrl[0] == '\n') { 1243 continue; 1244 } 1245 1246 idx = strUrl.find('\t'); 1247 if (idx != string::npos) { 1248 strUrl = strUrl.substr(0, idx); 1249 } 1250 1251 // filter invalid urls 1252 if (iCPage.IsFilterLink(strUrl)) 1253 continue; 1254 1255 
AddUrl(strUrl.c_str()); 1256 } 1257 } else { 1258 //cerr << "Cannot open " << UNVISITED_FILE << " for input\n"; 1259 } 1260 1261 // sleep(30); 1262 b_fOver = true; 1263 cout << "finished to get all unvisited urls." << endl; 1264 1265 // Wait for the threads. 1266 for (unsigned int i = 0; i < NUM_WORKERS; ++i) { 1267 (void) pthread_join(tids[i], NULL); 1268 } 1269 1270 cout << "closed " << NUM_WORKERS << " threads." << endl; 1271 1272 SaveUnvisitedUrl(); 1273 SaveReplicas("repli"); 1274 1275 memset(strTime, 0, 128); 1276 time(&tDate); 1277 strftime(strTime, 128, "%a, %d %b %Y %H:%M:%S GMT", gmtime(&tDate)); 1278 cout << "\n\nEnd at: " << strTime << "\n\n"; 1279 } 1280 1281 /***************************************************************** 1282 ** Function name: AddUrl 1283 ** Input argv: 1284 ** -- 1285 ** Output argv: 1286 ** -- 1287 ** Return: 1288 ** Function Description: Add a parsed url into the collection 1289 ** Version: 1.0 1290 ** Be careful: An important function!!! 1291 *****************************************************************/ 1292 void CCrawl::AddUrl(const char * url) { 1293 string strUrl = url; 1294 if (strUrl.empty() || strUrl.size() < 8) { //invalid url 1295 cout << "!so small!" << strUrl << endl; 1296 return; 1297 } 1298 1299 CPage iCPage; 1300 if (iCPage.NormalizeUrl(strUrl) == false) { 1301 // cout << "!normalize fail!" 
<< strUrl << endl; 1302 return; 1303 } 1304 1305 CUrl iUrl; 1306 1307 // for ImgSE, comment the paragraph 1308 // if image/xxx url, store it to link4History.url 1309 // begin 1310 if (iUrl.IsImageUrl(strUrl)) { 1311 if (m_ofsLink4HistoryFile) { 1312 pthread_mutex_lock(&mutexLink4HistoryFile); 1313 m_ofsLink4HistoryFile << strUrl << endl; 1314 ; 1315 pthread_mutex_unlock(&mutexLink4HistoryFile); 1316 } 1317 return; 1318 } 1319 // end 1320 1321 if (iUrl.ParseUrlEx(strUrl) == false) { 1322 cout << "ParseUrlEx error in AddUrl(): " << strUrl << endl; 1323 return; 1324 } 1325 1326 // if it is an invalid host, discard it 1327 if (iUrl.IsValidHost(iUrl.m_sHost.c_str()) == false) { 1328 cout << "!invalid host: " << iUrl.m_sHost << endl; 1329 return; 1330 } 1331 1332 // filter foreign hosts 1333 if (iUrl.IsForeignHost(iUrl.m_sHost)) { 1334 cout << "!foreign hosts: " << iUrl.m_sHost << endl; 1335 return; 1336 } 1337 1338 // if it is a block ip, discard it 1339 // this work is left in the CreatSocket() 1340 // because the work of getting ip is inevitable in the CreatSocket function 1341 // and this work is expensive 1342 // if it is an unreach host, discard it 1343 // here we only deal with numbers-and-dots notations 1344 unsigned long inaddr = 0; 1345 char *ip = NULL; 1346 1347 inaddr = (unsigned long) inet_addr(iUrl.m_sHost.c_str()); 1348 if (inaddr != INADDR_NONE) { // host is just ip 1349 //pthread_mutex_lock(&mutexMemory); 1350 ip = new char[iUrl.m_sHost.size() + 1]; 1351 //pthread_mutex_unlock(&mutexMemory); 1352 memset(ip, 0, iUrl.m_sHost.size() + 1); 1353 memcpy(ip, iUrl.m_sHost.c_str(), iUrl.m_sHost.size()); 1354 1355 if (!iUrl.IsValidIp(ip)) { // out of ip block 1356 //pthread_mutex_lock(&mutexMemory); 1357 delete[] ip; 1358 ip = NULL; 1359 //pthread_mutex_unlock(&mutexMemory); 1360 //cout << "!unreach hosts: " << iUrl.m_sHost << endl; 1361 return; 1362 } 1363 //pthread_mutex_lock(&mutexMemory); 1364 delete[] ip; 1365 ip = NULL; 1366 
//pthread_mutex_unlock(&mutexMemory); 1367 } 1368 1369 CStrFun::Str2Lower(iUrl.m_sHost, iUrl.m_sHost.size()); 1370 CMD5 iMD5; 1371 iMD5.GenerateMD5((unsigned char*) iUrl.m_sHost.c_str(), 1372 iUrl.m_sHost.size()); 1373 string strDigest = iMD5.ToString(); 1374 if (setUnreachHostMD5.find(strDigest) != setUnreachHostMD5.end()) { 1375 //cout << "!unreach host! " << iUrl.m_sHost << endl; 1376 return; 1377 } 1378 1379 // if crawled, discard it 1380 iMD5.GenerateMD5((unsigned char*) strUrl.c_str(), strUrl.size()); 1381 strDigest = iMD5.ToString(); 1382 1383 if (setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end()) { 1384 // cout << "!visited! " << strUrl << endl; 1385 return; 1386 } 1387 1388 // if already in the collection, discard it 1389 if (setUnvisitedUrlMD5.find(strDigest) != setUnvisitedUrlMD5.end()) { 1390 // cout << "!in collection! " << strUrl << endl; 1391 return; 1392 } else { 1393 pthread_mutex_lock(&mutexUnvisitedUrlMD5); 1394 setUnvisitedUrlMD5.insert(strDigest); 1395 pthread_mutex_unlock(&mutexUnvisitedUrlMD5); 1396 } 1397 1398 // add 1399 // make sure limited threads crawling on a site 1400 int cnt = 0; 1401 for (;;) { 1402 //if( mmapUrls.count(iUrl.m_sHost) < NUM_WORKERS_ON_A_SITE ){ 1403 1404 if (1) { 1405 //pthread_mutex_lock(&mutexVisitedUrlMD5); 1406 1407 // if crawled, discard it :) double secure 1408 //if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) { 1409 //cout << "!v! " << strUrl << endl; 1410 //pthread_mutex_unlock(&mutexVisitedUrlMD5); 1411 //return; 1412 //} else { 1413 1414 pthread_mutex_lock(&mutexVisitedUrlMD5); 1415 mmapUrls.insert(mvalType(iUrl.m_sHost, strUrl)); 1416 pthread_mutex_unlock(&mutexVisitedUrlMD5); 1417 break; 1418 //} 1419 } else { 1420 cnt++; 1421 if (cnt % 100 == 0) { 1422 cout << "~"; 1423 //cnt = 0; 1424 } 1425 1426 // If we have waiting so long, we may remove it 1427 if (cnt == 50000) { 1428 cout << "romove it!!!!!!!!!!!!!!!!!!!" 
<< endl; 1429 break; 1430 } 1431 usleep(4000); 1432 } 1433 1434 } 1435 1436 }