  • CPage

      1 #ifndef _Page_H_030728_
      2 #define _Page_H_030728_
      3 
      4 #include <string>
      5 #include <map>
      6 #include <vector>
      7 #include <list>
      8 #include "Url.h"
      9 #include "list.h"
     10 #include "uri.h"
     11 #include "hlink.h"
     12 
     13 
     14 //large enough to hold sina's 437 links 
     15 
     16 const int ANCHOR_TEXT_LEN       = 256;
     17 const int MAX_URL_REFERENCES    = 1000;
     18 const int URL_REFERENCE_LEN     = (URL_LEN+ANCHOR_TEXT_LEN)*MAX_URL_REFERENCES*1/2 ;
     19 const int MAX_TAG_NUMBERS    = 10000;
     20 
     21 using namespace std;
     22 
     23 // plain text or other
     24 enum page_type {
     25     PLAIN_TEXT,
     26     OTHER    
     27 };
     28 
      29 struct RefLink4SE // from <a href=...> and <area href=...> tags
     30 {
     31     char *link;
     32     char *anchor_text;
     33     string strCharset;
     34 };
     35 
      36 struct RefLink4History    // from <img src=...> and <script src=...> tags
     37 {
     38     char *link;
     39 };
     40 
     41 class CPage
     42 {
     43 public:
      44     // url & location
      45     string m_sUrl;        // URL string of this page
      46 
      47     // header
      48     string m_sHeader;     // raw HTTP header of the page
      49     int m_nLenHeader;     // length of the header
      50 
      51     int m_nStatusCode;    // HTTP status code
      52     int m_nContentLength; // body length taken from the header; usually not very accurate
      53     string m_sLocation;   // redirect target; used to tell whether this page redirects
      54     bool m_bConnectionState;    // true if the connection is Keep-Alive, false otherwise
      55     string m_sContentEncoding;  // content encoding of the body
      56     string m_sContentType;      // content type of the body
      57     string m_sCharset;          // character set of the body
      58     string m_sTransferEncoding; // transfer encoding of the body
      59 
      60     // content
      61     string m_sContent;    // page body
      62     int m_nLenContent;    // length of the page body
      63     string m_sContentNoTags;
      64 
      65 
      66     // link, in a lash-up state
      67     string m_sContentLinkInfo;
      68     // tags carrying hyperlink information extracted from the body, e.g. <img src="www.baidu.com"/> ,
      69     // <a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>
      70 
      71     // links for SE, in a lash-up state
      72     string m_sLinkInfo4SE;
      73     // <a href=...> and <area href=...> tags further extracted from m_sContentLinkInfo
      74     int m_nLenLinkInfo4SE;      // length of m_sLinkInfo4SE
      75 
      76     // links for history archiving, in a lash-up state
      77     string m_sLinkInfo4History; // <img src=...> tags further extracted from m_sContentLinkInfo
      78     int m_nLenLinkInfo4History; // length of m_sLinkInfo4History
      79 
      80 
      81     // links for SE, in a good state
      82     RefLink4SE m_RefLink4SE[MAX_URL_REFERENCES];// URL <--> anchor text pairs prepared for the search engine; at most MAX_URL_REFERENCES links per page
      83     int m_nRefLink4SENum;       // number of entries used in the array above
      84 
      85     // links for history archiving, in a good state
      86     RefLink4History m_RefLink4History[MAX_URL_REFERENCES/2];// links prepared for history page archiving
      87     int m_nRefLink4HistoryNum;  // number of entries used in the array above
      88 
      89     //map<string,string,less<string> > m_mapLink4SE;
      90     map<string,string> m_mapLink4SE;// URL <--> anchor text pairs prepared for the search engine;
      91                                     // the map is mainly used to drop duplicate URLs within a page
      92     vector<string > m_vecLink4History;// URLs for archiving; the vector likewise drops duplicates within a page
      93 
      94     // page type
      95     enum page_type m_eType;// type of the page
     96 
     97     // parsed url lists
     98     //list<string>    m_listLink4SE;
     99 
    100 public:
    101     CPage();
    102     CPage(string strUrl, string strLocation, char* header, char* body, int nLenBody);
    103     ~CPage();
    104 
    105     // parse header information from the header content
     106     void ParseHeaderInfo(string header);// parse the HTTP header
     107 
     108     // parse hyperlinks from the page content
     109     bool ParseHyperLinks();// extract the hyperlinks from the page
     110 
     111     bool NormalizeUrl(string& strUrl);// check whether strUrl is a well-formed URL and normalize it
     112 
     113     bool IsFilterLink(string plink);// decide whether the link plink should be filtered out
     114 
     115 private:
     116     // parse header information from the header content
     117     void GetStatusCode(string header);// get the status code
     118     void GetContentLength(string header);// get the body length from the header; usually not very accurate
     119     void GetConnectionState(string header);// get the connection state
     120     void GetLocation(string header);// get the redirect information
     121     void GetCharset(string header);// get the character set
     122     void GetContentEncoding(string header);// get the content encoding of the body
     123     void GetContentType(string header);// get the content type of the body
     124     void GetTransferEncoding(string header);// get the transfer encoding of the body
     125 
     126     // parse hyperlinks from the web page
     127     bool GetContentLinkInfo();// extract the tags that carry hyperlink information from the body,
     128                               // e.g. <img src="www.baidu.com"/>, <a href="www.baidu.com">百度</a>, <area href="www.baidu.com">百度</area>
     129 
     130 
     131     bool GetLinkInfo4SE();// from m_sContentLinkInfo, keep the <a href=...> and <area href=...> tags
     132     bool GetLinkInfo4History();// from m_sContentLinkInfo, keep the <img src=...> tags
     133     bool FindRefLink4SE();// finally produce the hyperlinks prepared for the search engine
     134     bool FindRefLink4History();// finally produce the hyperlinks prepared for history page archiving
    135 
    136 };
    137 
    138 #endif /* _Page_H_030728_ */
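
The header above is the whole surface the crawler works with: build a CPage from a fetched response, call ParseHeaderInfo, then ParseHyperLinks, and read the de-duplicated links out of m_mapLink4SE and m_vecLink4History. A minimal usage sketch, assuming the companion sources of this project (Url, StrFun, hlink, and so on) are compiled alongside; the response data below is only a placeholder:

    #include <cstring>
    #include <iostream>
    #include "Page.h"

    int main()
    {
        // placeholder response; in the crawler these buffers come from the HTTP fetcher
        string url    = "http://e.pku.cn/index.html";
        char header[] = "HTTP/1.0 200 OK\r\nContent-Type: text/html\r\nConnection: keep-alive\r\n\r\n";
        char body[]   = "<html><body><a href=\"http://e.pku.cn/a.html\">news</a> "
                        "<img src=\"http://e.pku.cn/a.gif\"></body></html>";

        CPage page(url, "", header, body, (int)strlen(body));
        page.ParseHeaderInfo(page.m_sHeader);

        if( page.ParseHyperLinks() )
        {
            map<string,string>::iterator it;
            for( it = page.m_mapLink4SE.begin(); it != page.m_mapLink4SE.end(); ++it )
                cout << it->first << " -> " << it->second << endl;
        }
        return 0;
    }
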
       1 /*Page handling
       2  */
       3 
       4 #include <iostream>
       5 #include <string>
       6 #include <cstring>
       7 #include <map>
       8 #include <vector>
       9 #include <iterator>
      10 #include "Url.h"
      11 #include "Page.h"
      12 #include "StrFun.h"
      13 
      14 
       15 // default constructor
       16 CPage::CPage()
       17 {
       18     // initialize the member variables
      19     m_nStatusCode = 0;
      20     m_nContentLength = 0;
      21     m_sLocation = "";
      22     m_bConnectionState = false;
      23     m_sContentEncoding = "";
      24     m_sContentType = "";
      25     m_sCharset = "";
      26     m_sTransferEncoding = "";
      27 
      28     m_sContentLinkInfo = "";
      29     m_sLinkInfo4SE = "";
      30     m_sLinkInfo4History = "";
      31 
      32     m_sContentNoTags = "";
      33     m_nRefLink4SENum = 0;
      34     m_nRefLink4HistoryNum = 0;
      35     m_eType = PLAIN_TEXT;
      36 
      37 
       38     // the hyperlink slots and their anchor-text slots all start out empty
      39     for(int i=0; i< MAX_URL_REFERENCES; i++ ){
      40         m_RefLink4SE[i].link = NULL;
      41         m_RefLink4SE[i].anchor_text = NULL;
      42         m_RefLink4SE[i].strCharset = "";
      43 
      44         if(i < MAX_URL_REFERENCES/2){
      45             m_RefLink4History[i].link = NULL;
      46         }
      47     }
      48 
      49 }
      50 
      51 CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody)
      52 {
      53     //assert( header != NULL );
      54     //assert( body != NULL );
      55     //assert( nLenBody > 0 );
      56 
      57     // CPage();
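       57     // note: writing "CPage();" here would only create and discard a temporary object;
       57     // C++98 has no delegating constructors, hence the initialization is repeated below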
      58     m_nStatusCode = 0;
      59     m_nContentLength = 0;
      60     m_sLocation = "";
      61     m_bConnectionState = false;
      62     m_sContentEncoding = "";
      63     m_sContentType = "";
      64     m_sCharset = "";
      65     m_sTransferEncoding = "";
      66 
      67     m_sContentLinkInfo = "";
      68     m_sLinkInfo4SE = "";
      69     m_sLinkInfo4History = "";
      70 
      71     m_sContentNoTags = "";
      72     m_nRefLink4SENum = 0;
      73     m_nRefLink4HistoryNum = 0;
      74     m_eType = PLAIN_TEXT;
      75 
       76     // the hyperlink slots and their anchor-text slots all start out empty
      77     for(int i=0; i< MAX_URL_REFERENCES; i++ ){
      78         m_RefLink4SE[i].link = NULL;
      79         m_RefLink4SE[i].anchor_text = NULL;
      80         m_RefLink4SE[i].strCharset = "";
      81 
      82         if(i < MAX_URL_REFERENCES/2){
      83             m_RefLink4History[i].link = NULL;
      84         }
      85     }
      86 
       87     // copy the constructor arguments into the member variables
       88     m_sUrl = strUrl;// URL of this page
       89     m_sLocation = strLocation;// redirect URL; empty if the page does not redirect
       90     m_sHeader = header;// HTTP header of the page
       91     m_nLenHeader = strlen(header);// length of the header
       92 
       93     m_sContent.assign(body, nLenBody);// page body: copy the first nLenBody characters pointed to by body
       94     m_nLenContent = nLenBody;// length of the page body
      95 
      96 }
      97 
      98 CPage::~CPage()
      99 {
     100 }
     101 
     102 
      103 // parse the header information -- dispatches to the eight private helpers below
     104 void CPage::ParseHeaderInfo(string strHeader)
     105 {
     106     GetStatusCode(strHeader);
     107     GetContentLength(strHeader);
     108     GetLocation(strHeader);
     109     GetConnectionState(strHeader);
     110 
     111     GetCharset(strHeader);
     112 
     113     GetContentEncoding(strHeader);
     114     GetContentType(strHeader);
     115     GetTransferEncoding(strHeader);
     116 }
     117 
      118 // get the HTTP status code
      119 void CPage::GetStatusCode(string headerBuf)
      120 {
      121     // example:
      122 
      123     // "HTTP/1.0 200 OK"  --  200 is the status code
      124     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
      125 
      126     const char *charIndex = strstr(headerBuf.c_str(), "http/");// locate the first occurrence of "http/"
      127     if (charIndex == NULL)
      128     {
      129         m_nStatusCode = -1;
      130         return;
      131     }
      132     // skip ahead to the character after the first space
      133     while(*charIndex != ' '){
      134         charIndex++;
      135     }
      136     charIndex++;
      137     
      138     int ret = sscanf(charIndex, "%i", &m_nStatusCode);// parse the numeric code
      139     if (ret != 1)  m_nStatusCode = -1;
      140 }
     141 
     142 
     143 
      144 // extract the body length from the header; usually not very accurate
      145 void CPage::GetContentLength(string headerBuf)
      146 {
      147     // example:
      148 
      149     // "content-length: 21237"  --  21237 is the body length as reported by the server, not necessarily correct
      150     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
      151 
      152     const char *charIndex = strstr(headerBuf.c_str(), "content-length");
      153     if (charIndex == NULL) return;
      154 
      155     while(*charIndex != ' '){
      156         charIndex++;
      157     }
      158     charIndex++;
      159     
      160     int ret = sscanf(charIndex, "%i", &m_nContentLength);
      161     if (ret != 1)  m_nContentLength = -1;
      162 }
     163 
     164 
      165 // get the redirect (Location) information
     166 void CPage::GetLocation(string headerBuf)
     167 {
     168     string::size_type pre_idx,idx;
     169     const string delims("\r\n");
     170 
     171     string strBuf =  headerBuf;
     172     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
     173 
     174     idx = headerBuf.find("location:");
      175     if (idx != string::npos)// found it
      176     {
      177         pre_idx = idx + sizeof("location: ") -1;
      178         idx = headerBuf.find_first_of(delims, pre_idx );// find the end of the line
     179         if (idx != string::npos)
     180         {
     181             //m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx);
     182             m_sLocation = strBuf.substr(pre_idx, idx - pre_idx);
     183         }
     184     }
     185 }
     186 
     187 
      188 // get the character set of the page
     189 void CPage::GetCharset(string headerBuf)
     190 {
     191     string::size_type pre_idx,idx;
     192     const string delims(" \",;>");
     193 
     194     CStrFun::Str2Lower(headerBuf, headerBuf.size());
     195 
     196     idx = headerBuf.find("charset=");
     197     if( idx != string::npos) {
      198         m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);// keep everything from "charset=" onwards
      199     }
      200 
      201     headerBuf = m_sContent;    // also look for a charset in the first ~2 KB of the body (e.g. a <meta> tag)
      202     headerBuf = headerBuf.substr(0,2024) ;
      203     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
      204     idx = headerBuf.find("charset=");
      205     if (idx != string::npos)// there may be extra characters after the value
     206     {
     207         pre_idx = idx + sizeof("charset=") -1;
     208         idx = headerBuf.find_first_of(delims, pre_idx );
     209         if(idx != string::npos){
     210             m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);
     211         }
     212     }
     213 }
     214 
     215 
      216 // get the content encoding of the body
     217 void CPage::GetContentEncoding(string headerBuf)
     218 {
     219     string::size_type pre_idx,idx;
     220     const string delims("\r\n");
     221 
     222     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
     223 
     224     idx = headerBuf.find("content-encoding:");
     225     if (idx != string::npos)
     226     {
     227         pre_idx = idx + sizeof("content-encoding: ") -1;
     228         idx = headerBuf.find_first_of(delims, pre_idx );
     229         if (idx != string::npos)
     230         {
     231             m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
     232         }
     233     }
     234 }
     235 
      236 // get the connection state (Keep-Alive or not)
     237 void CPage::GetConnectionState(string headerBuf)
     238 {
     239     string::size_type pre_idx,idx;
     240     const string delims(";\r\n");
     241 
     242     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
     243 
     244     idx = headerBuf.find("connection:");
     245     if (idx != string::npos)
     246     {
     247         pre_idx = idx + sizeof("connection: ") -1;
     248         idx = headerBuf.find_first_of(delims, pre_idx );
     249         if (idx != string::npos)
     250         {
     251             string str = headerBuf.substr(pre_idx, idx - pre_idx);
     252             //cout << "Connection state: " << str << endl;
     253             //if (str == "close") m_bConnectionState = false;
     254             if (str == "keep-alive") m_bConnectionState = true;
     255         }
     256     }
     257 }
     258 
      259 // get the content type of the body
     260 void CPage::GetContentType(string headerBuf)
     261 {
     262     string::size_type pre_idx,idx;
     263     const string delims(";\r\n");
     264 
     265     CStrFun::Str2Lower( headerBuf, headerBuf.size() );
     266 
     267     idx = headerBuf.find("content-type:");
     268     if (idx != string::npos)
     269     {
     270         pre_idx = idx + sizeof("content-type: ") -1;
     271         idx = headerBuf.find_first_of(delims, pre_idx );
     272         if (idx != string::npos)
     273         {
     274             m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);
     275         }
     276     }
     277 }
     278 
      279 // get the transfer encoding of the body
     280 void CPage::GetTransferEncoding(string headerBuf)
     281 {
     282     string::size_type pre_idx,idx;
     283     const string delims(";\r\n");
     284 
     285     CStrFun::Str2Lower( headerBuf, headerBuf.size() );
     286 
     287     idx = headerBuf.find("transfer-encoding:");
     288     if ( idx != string::npos)
     289     {
     290         pre_idx = idx + sizeof("transfer-encoding: ") -1;
     291         idx = headerBuf.find_first_of(delims, pre_idx );
     292         if(idx != string::npos)
     293         {
     294             m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
     295         }
     296     }
     297 }
     298 
     299 /*
      300  * Filter spam links
      301  * Returns true if the link should be filtered out, false otherwise
      302  */
      303 // decide whether a URL should be filtered out; returns true to filter it, false to keep it
     304 bool CPage::IsFilterLink(string plink)
     305 {
     306     if( plink.empty() ) return true;
     307     if( plink.size() > URL_LEN ) return true;
     308 
     309     string link = plink, tmp;
     310     string::size_type idx = 0;
     311 
     312     
      313     CStrFun::Str2Lower( link, link.length() );// lower-case every letter in link
      314 
      315     // if any of the following patterns occurs twice, filter the link (return true)
      316     tmp = link;
      317     idx = tmp.find("?");// filter URLs that contain two '?' characters
     318     if( idx != string::npos ){
     319         tmp = tmp.substr(idx+1);
     320         idx = tmp.find("?");
     321         if( idx != string::npos ) return true;
     322     }
     323 
      324     tmp = link;// filter when a '-' is later followed by a '+'
     325     idx = tmp.find("-");
     326     if( idx != string::npos ){
     327         tmp = tmp.substr(idx+1);
     328         idx = tmp.find("+");
     329         if( idx != string::npos ) return true;
     330     }
     331 
      332     // filter when '&' occurs twice
     333     tmp = link;
     334     idx = tmp.find("&");
     335     if( idx != string::npos ){
     336         tmp = tmp.substr(idx+1);
     337         idx = tmp.find("&");
     338         if( idx != string::npos ) return true;
     339     }
     340 
      341     // filter when "//" occurs twice
     342     tmp = link;
     343     idx = tmp.find("//");
     344     if( idx != string::npos ){
     345         tmp = tmp.substr(idx+1);
     346         idx = tmp.find("//");
     347         if( idx != string::npos ) return true;
     348     }
     349 
      350     // filter when "http" occurs twice
     351     tmp = link;
     352     idx = tmp.find("http");
     353     if( idx != string::npos ){
     354         tmp = tmp.substr(idx+1);
     355         idx = tmp.find("http");
     356         if( idx != string::npos ) return true;
     357     }
     358 
      359     // filter when "misc" occurs twice
     360     tmp = link;
     361     idx = tmp.find("misc");
     362     if( idx != string::npos ){
     363         tmp = tmp.substr(idx+1);
     364         idx = tmp.find("misc");
     365         if( idx != string::npos ) return true;
     366     }
     367 
      368     // filter when "ipb" occurs twice
     369     tmp = link;
     370     idx = tmp.find("ipb");
     371     if( idx != string::npos ){
     372         tmp = tmp.substr(idx+1);
     373         idx = tmp.find("ipb");
     374         if( idx != string::npos ) return true;
     375     }
     376 
     377     const char *filter_str[]={
     378     "cgi-bin",    "htbin",    "linder",    "srs5",        "uin-cgi",  // robots.txt of http://www.expasy.org/
     379     "uhtbin",    "snapshot",    "=+",        "=-",        "script",
     380     "gate",        "search",    "clickfile",    "data/scop",    "names",
     381     "staff/",    "enter",    "user",        "mail",    "pst?",
     382     "find?",    "ccc?",        "fwd?",        "tcon?",    "&amp",
     383     "counter?",    "forum",    "cgisirsi",    "{",        "}",
     384     "proxy",    "login",    "00.pl?",    "sciserv.pl",    "sign.asp",
     385     "<",        ">",        "review.asp?",    "result.asp?",    "keyword",
     386     "\"",        "'",        "php?s=",    "error",    "showdate",
     387     "niceprot.pl?",    "volue.asp?id",    ".css",        ".asp?month",    "prot.pl?",
     388     "msg.asp",    "register.asp", "database",    "reg.asp",    "qry?u",
     389     "p?msg",    "tj_all.asp?page", ".plot.",    "comment.php",    "nicezyme.pl?",
     390     "entr",        "compute-map?", "view-pdb?",    "list.cgi?",    "lists.cgi?",
     391     "details.pl?",    "aligner?",    "raw.pl?",    "interface.pl?","memcp.php?",
     392     "member.php?",    "post.php?",    "thread.php",    "bbs/",        "/bbs"
     393     };
      394     int filter_str_num = (int)(sizeof(filter_str)/sizeof(filter_str[0]));    // 75 entries
      395 
      396     // filter the link if it contains any of the strings above
     397     for(int i=0; i<filter_str_num; i++){
     398         if( link.find(filter_str[i]) != string::npos)
     399         return true;
     400     }    
     401 
     402     return false;
     403 }
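          // illustrative sanity check of the filter, assuming the class is built:
          //     CPage p;
          //     p.IsFilterLink("http://a.com/x?a=1?b=2");   // true  -- two '?' characters
          //     p.IsFilterLink("http://a.com/index.html");  // false -- nothing matches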
     404 
     405 /////////////////////////////
     406 // just for ImgSE
     407 // e.g: http://www.people.com.cn/GB/tupian/index.html
     408 //     http://news.xinhuanet.com/photo/
     409 //     http://photo.tom.com/
     410 /////////////////////////////
     411 // comment previous one and open this one
     412 
     413 /*
     414 bool CPage::IsFilterLink(string plink)
     415 {
     416     if( plink.empty() ) return true;
     417     if( plink.size() > URL_LEN ) return true;
     418 
     419     return false;
     420 
     421     string link = plink, tmp;
     422     string::size_type idx = 0;
     423 
     424     
     425     CStrFun::Str2Lower( link, link.length() );
     426 
     427     const char *filter_str[]={
     428         "tupian", "photo", "ttjstk"
     429         };
     430     int filter_str_num = 3;
     431 
     432     CStrFun::Str2Lower( link, link.length() );
     433 
     434     for(int i=0; i<filter_str_num; i++){
     435         if( link.find(filter_str[i]) != string::npos)
     436         return false;
     437     }    
     438 
     439     return true;
     440 }
     441 */
     442 
     443 
     444 /*****************************************************************
     445 ** Function name: ParseHyperLinks
     446 ** Input argv:
     447 **      --
     448 ** Output argv:
     449 **      --
     450 ** Return:
     451         true: success
     452         false: fail
     453 ** Function Description:  Parse hyperlinks from the web page
     454 ** Version: 1.0
     455 ** Be careful:
     456 *****************************************************************/
     457 bool CPage::ParseHyperLinks()
     458 {
     459     if( GetContentLinkInfo() == false ) return false;
     460 
     461     if( m_sContentLinkInfo.empty() ) return false;
     462 
     463     bool bFind4SE = false;
     464     bool bFind4History = false;
     465     if( GetLinkInfo4SE() ){
     466         if( FindRefLink4SE() ) bFind4SE = true;
     467     } 
     468 
     469     if( GetLinkInfo4History() ){
     470         if( FindRefLink4History() ) bFind4History = true;
     471     }
     472 
      473     // return false if neither SE links nor history-archiving links were extracted from the page
     474     if( !bFind4SE && !bFind4History ){
     475          return false;
     476     }
     477 
     478     //return   GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE);
     479 
     480     return true;
     481 }
     482 
     483 
     484 /*****************************************************************
     485 ** Function name: GetContentLinkInfo
     486 ** Input argv:
     487 **      --
     488 ** Output argv:
     489 **      --
     490 ** Return:
     491         true: success
     492         false: fail
     493 ** Function Description:  Parse hyperlinks from the web page
     494 ** Version: 1.0
     495 ** Be careful:
     496 *****************************************************************/
     497 
      498 // extract the tags that carry hyperlink information from the page body
     499 bool CPage::GetContentLinkInfo()
     500 {
     501     if( m_sContent.empty() ) return false;
     502     
     503     m_sContentLinkInfo = m_sContent;
     504 
      505     string& s = m_sContentLinkInfo; // work on it through a reference
     506 
     507     // transform all separation into one space character
     508     //CStrFun::ReplaceStr(s, "\t", " ");
     509     //CStrFun::ReplaceStr(s, "\r", " ");
     510     //CStrFun::ReplaceStr(s, "\n", " ");
     511     const string delims(" \t\r\n");
     512     string::size_type idx=0, pre_idx;
     513     
      514     // turn every whitespace character into ' ' and collapse runs of consecutive whitespace into a single space
     515     while( (idx = s.find_first_of(delims, idx)) != string::npos )
     516     {
     517         pre_idx = idx;
     518         s.replace(idx,1,1,' ');
     519         idx++;
     520         
     521         while( (idx = s.find_first_of(delims, idx)) != string::npos )
     522         {
     523             if( idx-pre_idx == 1 ){
     524                 s.erase(idx, 1);
     525             } else {
     526                 break;
     527             }
     528         }
     529 
     530         idx--;
     531     }
     532 
     533     // transform all "<br>" into one space character
      534     // replace every <br> tag in s with a space
     535     CStrFun::ReplaceStr(s, "<br>", " ");
     536 
     537     if( s.size() < 20 ) return false;
     538 
     539     // Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.
     540     string::size_type idxHref=0,idxArea=0,idxImg=0;
     541     string dest;
     542 
     543     do{
     544         if( s.empty() ) break;
     545 
     546         idxHref = CStrFun::FindCase(s, "href");
     547         idxArea = CStrFun::FindCase(s, "<area");
     548         idxImg = CStrFun::FindCase(s, "<img");
     549 
     550         pre_idx = idxHref > idxArea? idxArea: idxHref;
     551         pre_idx = idxImg > pre_idx? pre_idx: idxImg;
     552         if( pre_idx == string::npos) break;
     553 
     554         s = s.substr(pre_idx);
     555         idx = s.find_first_of('<',1);
     556         if( idx != string::npos ){
     557             dest = dest + s.substr(0,idx);
     558         }else{
     559             break;
     560         }
     561 
     562         s = s.substr(idx);
     563         idxHref=0; idxArea=0; idxImg=0;
     564     }while(1);
     565 
     566     s = dest;
     567 
     568     
      569     /* erase all '\' characters
      570      * to avoid situations like:
     571      *      document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");
     572     */
     573     CStrFun::EraseStr(s, "\\");
     574 
     575     if( s.size() < 20 ) return false;
     576 
     577     return true;
     578 }
     579 
     580 /*****************************************************************
     581 ** Function name: GetLinkInfo4SE()
     582 ** Input argv:
     583 **      --  
     584 ** Output argv:
     585 **      --
     586 ** Return:
     587        true: success
     588        false: fail
     589 ** Function Description:  Get links for SE
     590 ** Version: 1.0
     591 ** Be careful:
     592 *****************************************************************/
     593 
      594 // from m_sContentLinkInfo, extract the hyperlinks prepared for the search engine
     595 bool CPage::GetLinkInfo4SE()
     596 {
     597 
     598     if( m_sContentLinkInfo.empty() ) return false;
     599 
     600     m_sLinkInfo4SE = m_sContentLinkInfo;
     601     string& s = m_sLinkInfo4SE;
     602 
     603      // Keep only <area ...>,and <a href ...> tags.
     604     string::size_type idxHref=0,idxArea=0,
     605         idx,pre_idx;
     606     string dest;
     607 
     608 
     609 
     610 
     611 
     612 
     613 
     614 
      615     /*
      616 
      617     For example, if m_sContentLinkInfo = href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
      618 
      619     then this step keeps href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ and drops <img src="http://www.google.com.hk">,
      620 
      621     because the <img src=...> links are the ones prepared for history page archiving.
      622 
      623     */
     624 
     625 
     626     do{
     627         if( s.empty() ) break;
     628 
     629         //idxHref = CStrFun::FindCase(s, "<a href");
     630         idxHref = CStrFun::FindCase(s, "href");
     631         idxArea = CStrFun::FindCase(s, "<area ");
     632 
     633         pre_idx = idxHref > idxArea? idxArea: idxHref;
     634         //pre_idx = idxHref;
      635         if( pre_idx == string::npos) break;// nothing left to extract
     636 
     637         s = s.substr(pre_idx);
     638         idx = s.find_first_of('<',1);
     639 
     640         if( !(s.length() < 4) )
     641         {
     642             idxHref = CStrFun::FindCaseFrom(s, "href", 4);
     643             idx = idx > idxHref ? idxHref: idx;
     644         }
     645 
     646         if( idx != string::npos ){
     647             dest = dest + s.substr(0,idx);
     648         }else if (idx == string::npos && pre_idx != string::npos){
     649             dest = dest + s;
     650             break;
     651         }else{
     652             break;
     653         }
     654 
     655         s = s.substr(idx);
     656         idxHref=0; idxArea=0;
     657     }while(1);
     658         
      659     s = dest;// dest holds the filtered data
     660     if( s.length() < 20 ) return false;
     661 
     662 
     663     // erase all '"' , '\'', "&nbsp;".
     664     CStrFun::EraseStr(s, "\"");
     665     CStrFun::EraseStr(s, "'");
     666     CStrFun::EraseStr(s, "&nbsp");
     667 
     668      // Keep URLs and anchor text.
     669 
     670     idxHref=0;
     671     const string delims( " #>");
     672     dest.clear();
     673 
     674 
     675 
      676     /*
      677 
      678     The pass above leaves us with href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ
      679 
      680     and this second pass reduces it to
      681 
      682     m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
      683 
      684     */
     685 
     686 
     687 
     688     do{
     689         if( s.empty() ) break;
     690         idxHref = CStrFun::FindCase(s, "href");
     691 
     692         if( idxHref == string::npos) break;
     693         pre_idx = idxHref;
     694 
     695         //####
     696         idx = s.find('=', idxHref);
     697         if( idx == string::npos ) break;
     698         s = s.substr(idx+1);
     699 
     700         while( s.length() > 0 && s[0] == ' ' ){
     701             s.erase(0,1);
     702         }
     703         if( s.length() == 0 ) break;
     704 
     705         idx = s.find_first_of(delims,1);
     706         //cout << endl << s.substr(0, idx) << endl;
     707         if( idx == string::npos ) break;
     708 
     709         dest += '"' + s.substr(0, idx);
     710 
     711         //cout << endl << dest << endl;
     712             
     713         idx = s.find('>');
     714         if( idx == string::npos ) break;
     715         dest += '>';
     716         s = s.substr(idx +1);
     717             
     718         idx = s.find('<');
     719 
     720         if( !s.empty() ){
     721             idxHref = CStrFun::FindCase(s, "href");
     722             idx = idx > idxHref ? idxHref: idx;
     723         }    
     724 
     725         if( idx == string::npos ){
     726             dest += s;
     727             break;
     728         }
     729 
     730 /*
     731         if( idx == idxHref ){
     732             dest += '"' + s.substr(0,idx);
     733         }else{
     734 */
     735             dest += s.substr(0,idx);
     736         //}
     737         //####
     738 
     739         idxHref=0;
     740     }while(1);
     741         
     742     // look for empty filenames.
     743     idx = 0;
     744     while( (idx = dest.find("\"\"",idx)) != string::npos ){
     745         dest.erase(idx, 1);
     746     }
     747 
     748     s = dest;
     749 
     750     return( s.length() < 20 ? false: true );
     751 
     752 }
     753                     
     754 /*****************************************************************
     755 ** Function name: GetLinkInfo4History()
     756 ** Input argv:
     757 **      --  
     758 ** Output argv:
     759 **      --
     760 ** Return:
     761        true: success
     762        false: fail
     763 ** Function Description:  Get links for history archiving
     764 ** Version: 1.0
     765 ** Be careful:
     766 *****************************************************************/
     767 bool CPage::GetLinkInfo4History()
     768 {
      769     /*
      770 
      771     For example, if m_sContentLinkInfo = href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
      772 
      773     then this step keeps <img src="http://www.google.com.hk"> and drops href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ,
      774 
      775     because the href=... links are the ones prepared for the search engine.
      776 
      777     */
     778 
     779 
     780 
     781 
     782 
     783 
     784     if( m_sContentLinkInfo.empty() ) return false;
     785 
     786     m_sLinkInfo4History = m_sContentLinkInfo;
     787     string& s = this->m_sLinkInfo4History;
     788 
     789      // Keep only <img ...> tags.
     790     string::size_type idxImg=0,
     791         idx,pre_idx;
     792     string dest;
     793 
     794     do{
     795         if( s.empty() ) break;
     796         idxImg = CStrFun::FindCase(s, "<img");
     797 
     798         pre_idx = idxImg;
     799         if( pre_idx == string::npos) break;
     800 
     801         s = s.substr(pre_idx);
     802         idx = s.find_first_of('<',1);
     803 
     804         if( idx != string::npos ){
     805             dest = dest + s.substr(0,idx);
     806         }else if (idx == string::npos && pre_idx != string::npos){
     807             dest = dest + s;
     808             break;
     809         }else{
     810             break;
     811         }
     812 
     813         s = s.substr(idx);
     814         idxImg=0;
     815     }while(1);
     816         
     817     s = dest;
     818     if( s.length() < 20 ) return false;
     819 
     820     // erase all '"'. '\'',"&nbsp;".
     821     CStrFun::EraseStr(s , "\"");
     822     CStrFun::EraseStr(s , "'");
     823     CStrFun::EraseStr(s , "&nbsp");
     824 
     825      // Keep URLs and anchor text.
     826 
     827     idxImg=0;
     828     string::size_type idxSrc = 0;
     829     const string delims( " #>");
     830     dest.clear();
     831 
     832 
      833     /*
      834 
      835     The pass above leaves us with <img src="http://www.google.com.hk">
      836 
      837     and this second pass reduces it to
      838 
      839     m_sLinkInfo4History="http://www.google.com.hk>
      840 
      841     */
     842 
     843     do{
     844         if( s.empty() ) break;
     845         idxImg = CStrFun::FindCase(s, "img");
     846 
     847         if( idxImg == string::npos) break;
     848         pre_idx = idxImg;
     849 
     850         s = s.substr(idxImg+3);        // skip "img"
     851 
     852         //####
      853         idx = s.find('>', idxImg);
      854         if( idx == string::npos) break;    // no closing '>' for this tag
     855         if( s.empty() ) break;
     856         idxSrc = CStrFun::FindCase(s, "src");
     857         if( idxSrc > idxImg ) continue;
     858         s = s.substr(idxSrc);
     859 
     860         idx = s.find('=', idxImg);
     861         if( idx == string::npos ) break;
     862         s = s.substr(idx+1);
     863 
     864         while( s.length() > 0 && s[0] == ' ' ){
     865             s.erase(0,1);
     866         }
     867         if( s.length() == 0 ) break;
     868 
     869         idx = s.find_first_of(delims,1);
     870         if( idx == string::npos ) break;
     871 
     872         if( s.at(0) == '"'){
     873             dest += s.substr(0, idx);
     874         }else{
     875             dest += '"' + s.substr(0, idx);
     876         }
     877             
     878         idx = s.find('>');
     879         if( idx == string::npos ) break;
     880         dest += '>';
     881         s = s.substr(idx +1);
     882             
     883         idx = s.find('<');
     884         if( idx == string::npos ){
     885             dest += s;
     886             break;
     887         }
     888         dest += s.substr(0,idx);
     889         //####
     890 
     891         idxImg=0;
     892     }while(1);
     893         
     894 
     895     // look for empty filenames.
     896     idx = 0;
     897     while( (idx = dest.find("\"\"",idx)) != string::npos ){
     898         dest.erase(idx, 1);
     899     }
     900 
     901     s = dest;
     902 
     903     return( s.length() < 20 ? false: true );
     904 
     905 }
     906 
     907 
     908 
     909 
      910 // check whether strUrl is a well-formed URL, normalizing it along the way
     911 bool CPage::NormalizeUrl(string& strUrl)
     912 {
     913     string::size_type idx;
     914 
     915 
      916     // a URL without the http:// scheme is not considered well-formed here
     917     if( CStrFun::FindCase(strUrl, "http://") == string::npos ) return false;
     918 
     919     // convert "http://e.pku.cn" to "http://e.pku.cn/"
      920     // e.g. http://www.baidu.com becomes http://www.baidu.com/
     921     idx = strUrl.rfind('/');
     922     if( idx < 8 ) {
     923         strUrl = strUrl + "/";
     924         return true;
     925     }
     926 
      927     // collapse "/./" to "/"
     928     while( (idx=strUrl.find("/./")) != string::npos ){
     929         if( idx != string::npos ) strUrl.erase(idx,2);
     930     }
     931 
      932     // resolve "xxx/x/../yyy" to "xxx/yyy"
     933     while( (idx = strUrl.find("/../")) != string::npos ){
     934         string strPre,strSuf;
     935 
     936         strPre = strUrl.substr(0, idx);
     937 
     938         if( strUrl.length() > idx+4 )
     939             strSuf = strUrl.substr(idx+4);
     940 
     941         idx = strPre.rfind("/");
     942         if( idx != string::npos)
     943             strPre = strPre.substr(0,idx+1);
     944         if( strPre.length() < 10 ) return false;
     945 
     946         strUrl = strPre + strSuf;
     947     }
     948 
     949     if( CStrFun::FindCase(strUrl, "http://") != 0 ) return false;
     950 
     951     return true;
     952 }
     953 
     954 
     955 
     956 
     957 
     958 
     959 
      960 /* Finally produce the hyperlinks prepared for the search engine.
      961 
      962    Relative and absolute URLs are handled separately. The same link can be extracted from a page
      963 
      964    more than once, so duplicates must be removed; the map container takes care of that here.
      965 
      966    URLs that are not well-formed are dropped,
      967 
      968    and URLs that match the spam filter are dropped as well -- via IsFilterLink(string strUrl).
      969 
      970  */
     971 bool CPage::FindRefLink4SE()
     972 {
     973     if( m_sLinkInfo4SE.empty() ) return false;
     974 
     975     char *buffer = (char*)m_sLinkInfo4SE.c_str();
     976     int urlnum=0,len;
     977     char *ptr ;
     978 
     979     static char buf[URL_REFERENCE_LEN];
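              // note: this function-local static buffer is reused on every call, so the char*
              // entries stored in m_RefLink4SE remain valid only until the next page is parsed,
              // and the method is not reentrant or thread-safe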
     980 
     981     memset(buf, 0, URL_REFERENCE_LEN);
     982     len = strlen(buffer);
     983     if( len < 8 ) return false;
     984 
      985     len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1;// clamp len to the buffer size
     986     strncpy( buf, buffer, len);
     987 
     988 /*first
     989  *------>
     990  */
     991 
     992 
      993     /*
      994 
      995     For example, m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
      996 
      997     is split here into
      998 
      999     http://www.baidu.com/  百度
     1000 
     1001     http://www.qq.com/     QQ
     1002 
     1003     */
    1004     ptr = buf;
    1005     while( ptr - buf < len  && *ptr )
    1006     {
    1007         while( *ptr == '"' && *ptr) ptr++;
    1008         if ( !*ptr ) break;
     1009         this->m_RefLink4SE[ urlnum].link = ptr;// at most MAX_URL_REFERENCES links per page
    1010         while( *ptr && *ptr != '>')
    1011         {
     1012             // a space before the '>' ends the URL, because a URL cannot contain a space;
     1013             if(*ptr == ' ') *ptr = '\0';
     1014             // e.g. "http://www.baidu.com/" height=100 width=150>百度 -- the space means more attributes follow
    1015             ptr++;
    1016         }
    1017 
    1018         if ( !*ptr ){
    1019             urlnum++;
    1020             break;
    1021         }
    1022         if ( *ptr == '>' )
    1023         {
    1024             *ptr++='\0';
    1025             if( !*ptr )
    1026             {
    1027                 urlnum++;
    1028                 break;
    1029             }
    1030             
    1031             if( *ptr == '"' )
    1032             {
    1033                 this->m_RefLink4SE[urlnum].anchor_text = NULL;
    1034             }
    1035             else
    1036             {
    1037                 this->m_RefLink4SE[urlnum].anchor_text = ptr;
    1038                 while( *ptr && *ptr != '"') ptr++;
    1039                 if (!*ptr)
    1040                 {
    1041                     urlnum++;
    1042                     break;
    1043                 }
    1044                 if ( *ptr == '"') *ptr='\0';
    1045             }
    1046 
    1047         }
    1048         
    1049         //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t';
    1050         //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
    1051 
    1052         ptr++;
    1053         urlnum++;
     1054         if ( urlnum == MAX_URL_REFERENCES) break; // reached the maximum number of URLs
    1055     }
    1056     //cout << endl << this->m_RefLink4SE[ urlnum].link << endl;
    1057     //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
    1058 
    1059     this->m_nRefLink4SENum = urlnum;
    1060 
    1061 /*second
    1062  *------>
    1063  */
    1064     //typedef map<string,string,less<string> >::value_type valType;
    1065     typedef map<string,string>::value_type valType;
    1066 
    1067     m_mapLink4SE.clear();
    1068 
    1069     //string strRootUrl= m_sUrl;
    1070     CUrl iUrl;
    1071     if( iUrl.ParseUrlEx(m_sUrl) == false )
    1072     {
    1073         cout << "ParseUrlEx error in FindRefLink4SE(): " << m_sUrl << endl;
    1074         return false;
    1075     }
    1076     
    1077     for(int i=0; i<m_nRefLink4SENum; i++)
    1078     {
    1079 
    1080         string str;
    1081         string::size_type idx;
    1082         const string delims(" #");
    1083 
    1084         str = m_RefLink4SE[i].link;
    1085         idx = str.find_first_of(delims, 0 );
     1086         if( idx != string::npos )// found a delimiter
     1087         {
     1088             str = str.substr(0, idx);// keep only the URL part before the ' ' or '#'
    1089         }
    1090         if( str.size() == 0 || str.size() > URL_LEN - 1 || str.size() < 4 ) 
    1091             continue;
    1092 
    1093 
    1094         string::size_type idx1;
    1095         idx1 = CStrFun::FindCase(str, "http");
     1096         if( idx1 != 0  )// str may be a relative path
    1097         {
    1098             char c1 = m_sUrl.at(m_sUrl.length()-1);
    1099             char c2 = str.at(0);
    1100 
     1101             if( c2=='/' )// str is a root-relative path
     1102             {
     1103                 if( iUrl.m_nPort != 80 )// non-default port, keep it in the rebuilt URL
    1104                 {
    1105                     cout << iUrl.m_sHost << endl;
    1106                     cout << str << endl;
    1107                     //str = "http://" + iUrl.m_sHost + ":" + (const char*)(iUrl.m_nPort) + str;
    1108                     str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
    1109                 } 
    1110                 else 
    1111                 {
    1112                     str = "http://" + iUrl.m_sHost + str;
    1113                 }
    1114             } 
     1115             else if( c1!='/' && c2!='/')// neither side has a '/', resolve str against the directory of m_sUrl
    1116             {
    1117                 string::size_type idx;
    1118 
    1119                 idx = m_sUrl.rfind('/');
    1120                 if( idx != string::npos )//若不是最后
    1121                 {
    1122                     if( idx > 6 )
    1123                     { // > strlen("http://..")
    1124                         str = m_sUrl.substr(0, idx+1) + str;
    1125                     } 
    1126                     else 
    1127                     {
    1128                         str = m_sUrl + "/" + str;
    1129                     }
    1130 
    1131                 } else {
    1132 
    1133                     continue;
    1134                 }
    1135 
    1136             } 
    1137             else 
    1138             {
    1139                 if( c1=='/' )
    1140                 {
    1141                     str = m_sUrl + str;
    1142                 }
    1143                 else 
    1144                 {
    1145                     str = m_sUrl + "/" + str;
    1146                 }
    1147             }
    1148         }
    1149 
    1150         if( NormalizeUrl(str) == false ) continue;
    1151 
    1152         if( IsFilterLink(str) ) continue;
    1153 
    1154         //debug
    1155         //cout << "reflink: " << reflink << endl;
    1156 
     1157         if( str == m_sUrl )// the extracted link is the page itself; skip it, we already have this URL
    1158         {
    1159             continue;
    1160         }
    1161         else
    1162         {
     1163             if( m_RefLink4SE[i].anchor_text )// the URL has anchor text
    1164             {
    1165                 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
    1166                 {
    1167                     m_mapLink4SE.insert( valType( str, m_RefLink4SE[i].anchor_text));
    1168                 }
    1169             }
     1170             else// no anchor text; store an empty string instead
    1171             {
    1172                 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
    1173                 {
    1174                     m_mapLink4SE.insert( valType( str, "\0") );
    1175                     cout << ".";
    1176                 }
    1177             }
    1178         }
    1179             
    1180 
    1181     }
    1182 
    1183     m_nRefLink4SENum = m_mapLink4SE.size();
    1184 
    1185     //cout << endl;
    1186 
    1187     return true;
    1188 }
    1189 
    1190 
    1191 
    1192 
    1193 
     1194 // Finally produce the hyperlinks prepared for history page archiving.
     1195 
     1196 // Relative and absolute URLs are handled separately. The same link can be extracted from a page
     1197 
     1198 // more than once, so duplicates must be removed; the vector container takes care of that here.
     1199 
     1200 // URLs that are not well-formed are dropped,
     1201 
     1202 // and URLs that match the spam filter are dropped as well -- via IsFilterLink(string strUrl).
    1203 bool CPage::FindRefLink4History()
    1204 {
    1205     if( m_sLinkInfo4History.empty() ) return false;
    1206 
    1207     char *buffer = (char*)m_sLinkInfo4History.c_str();
    1208     int urlnum=0,len;
    1209     char *ptr ;
    1210 
    1211     static char buf[URL_REFERENCE_LEN/2];
    1212 
    1213     memset(buf, 0, URL_REFERENCE_LEN/2);
    1214     len = strlen(buffer);
    1215     if( len < 8 ) return false;
    1216 
    1217     len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1;
    1218     strncpy( buf, buffer, len);
    1219 
    1220 /*first
    1221  *------>
    1222  */
    1223     ptr = buf;
    1224     while( ptr - buf < len  && *ptr ){
    1225         while( *ptr == '"' && *ptr) ptr++;
    1226         if ( !*ptr ) break;
    1227         this->m_RefLink4History[ urlnum].link = ptr;
    1228 
    1229         while( *ptr && *ptr != '>'){
    1230             if( *ptr == ' ') *ptr='\0';
    1231             ptr++;
    1232         }
    1233 
    1234         if( !*ptr){
    1235             urlnum++;
    1236             break;
    1237         }
    1238         if( *ptr == '>' ){
    1239             *ptr++ = 0;
    1240             if( !*ptr ){
    1241                 urlnum++;
    1242                 break;
    1243             }
    1244             if( *ptr == '"' ){
    1245             
    1246             }else{
    1247                 while( *ptr && *ptr != '"') ptr++;
    1248                 if( !*ptr ){
    1249                     urlnum++;
    1250                     break;
    1251                 }
    1252                 if ( *ptr == '"' ) *ptr++='\0';
    1253             }
    1254         }
    1255         
    1256         ptr++;
    1257         urlnum++;
    1258         if ( urlnum == MAX_URL_REFERENCES/2) break;
    1259     }
    1260 
    1261 
    1262     this->m_nRefLink4HistoryNum = urlnum;
    1263 
    1264 /*second
    1265  *------>
    1266  */
    1267     m_vecLink4History.clear();
    1268     //string strRootUrl= m_sUrl;
    1269         CUrl iUrl;
    1270         if( iUrl.ParseUrlEx(m_sUrl) == false ){
    1271         cout << "ParseUrlEx error in FindRefLink4History(): " << m_sUrl << endl;
    1272         return false;
    1273     }
    1274 
    1275     for(int i=0; i<m_nRefLink4HistoryNum; i++){
    1276         string str;
    1277         //string::size_type idx;
    1278 
    1279         str = m_RefLink4History[i].link;
    1280         if( str.size()==0 || str.size() > URL_LEN - 1 
    1281             || str.size() < 4 ) continue;
    1282 
    1283 /*
    1284         char *pdest1, *pdest2;
    1285         pdest1 = strstr( str.c_str(), "http" );
    1286         pdest2 = strstr( str.c_str(), "HTTP" );
    1287         if( pdest1==NULL && pdest2==NULL ){
    1288 */
    1289 
    1290         string::size_type idx1;
    1291         idx1 = CStrFun::FindCase(str, "http");
    1292         if( idx1 != 0 ){
    1293             char c1 = m_sUrl.at(m_sUrl.length()-1);
    1294             char c2 = str.at(0);
    1295 
    1296             if( c2=='/' ){
    1297                 if( iUrl.m_nPort != 80 ){
    1298                     str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
    1299                 } else {
    1300                     str = "http://" + iUrl.m_sHost + str;
    1301                 }
    1302             } else if( c1!='/' && c2!='/'){
    1303                 string::size_type idx;
    1304 
    1305                 idx = m_sUrl.rfind('/');
    1306                 if( idx != string::npos ){
    1307                     if( idx > 6 ){ // > strlen("http://..")
    1308                         str = m_sUrl.substr(0, idx+1) + str;
    1309                     } else {
    1310                         str = m_sUrl + "/" + str;
    1311                     }
    1312 
    1313                 } else {
    1314 
    1315                     continue;
    1316                 }
    1317 
    1318             } else {
    1319                 if( c1=='/' ){
    1320                     str = m_sUrl + str;
    1321                 } else {
    1322                     str = m_sUrl + "/" + str;
    1323                 }
    1324             }
    1325         }
    1326 
    1327         // due to bad link parser
    1328 /*
    1329 
    1330         idx = reflink.find(' ');
    1331         if(idx != string::npos){
    1332             reflink = reflink.substr(0,idx);
    1333         }
    1334         idx = reflink.find('"');
    1335         if(idx != string::npos){
    1336             reflink = reflink.substr(0,idx);
    1337         }
    1338 */
    1339         //#############
    1340 
    1341         if( NormalizeUrl(str) == false ) continue;
    1342 
    1343 
    1344         if( IsFilterLink(str) ) continue;
    1345 
    1346 
    1347         if( str == m_sUrl ){
    1348             continue;
    1349         }else{
    1350             vector<string>::iterator it;
    1351             it = find(m_vecLink4History.begin(), m_vecLink4History.end(),str);
    1352             if( it == m_vecLink4History.end() ){
    1353 
    1354                 m_vecLink4History.push_back( str);
    1355                 cout << ".";
    1356             }
    1357         }
    1358             
    1359 
    1360     }
    1361     m_nRefLink4HistoryNum = m_vecLink4History.size();
    1362     //cout << endl;
    1363 
    1364     return true;
    1365 }
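
A minimal sketch of how NormalizeUrl rewrites links, assuming the crawler's companion sources (Url, StrFun, and so on) are compiled alongside Page; the sample URLs are only placeholders:

    #include <iostream>
    #include "Page.h"

    int main()
    {
        CPage page;
        string urls[] = {
            "http://e.pku.cn",                   // gains a trailing '/'
            "http://e.pku.cn/a/./b.html",        // "/./" collapses to "/"
            "http://e.pku.cn/a/x/../b.html",     // "/../" resolves to the parent directory
            "ftp://e.pku.cn/a.html"              // no "http://", rejected
        };

        for( int i = 0; i < 4; i++ )
        {
            string u = urls[i];
            bool ok = page.NormalizeUrl(u);
            cout << urls[i] << "  ->  " << ( ok ? u : string("(rejected)") ) << endl;
        }
        return 0;
    }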
  • Original article: https://www.cnblogs.com/kakamilan/p/2572060.html