zoukankan      html  css  js  c++  java
  • CUrl

     1 #ifndef _URL_H_030728_
     2 #define _URL_H_030728_
     3 
     4 #include <string>
     5 
     /* Size limit for a complete URL; nothing in the code visible here reads it. */
     static const unsigned int URL_LEN  = 256U;
     /* Size limit for a host name; IsForeignHost and IsImageUrl use it as a
      * length cut-off for their arguments. */
     static const unsigned int HOST_LEN = 256U;
     8 
     9 using namespace std;
    10 
    11 
    /* URL schemes recognised by CUrl.  ParseScheme converts an index into the
     * supported_schemes table straight into this enum, so the numeric values
     * below must keep following the order of that table. */
    enum url_scheme {
        SCHEME_HTTP    = 0,
        SCHEME_FTP     = 1,
        SCHEME_INVALID = 2
    };
    17 
    /* Default port numbers listed for the two supported schemes. */
    static const int DEFAULT_HTTP_PORT = 80;
    static const int DEFAULT_FTP_PORT  = 21;
    20 
    21 class CUrl
    22 {
    23 public:
    24     string m_sUrl;            // 原始的url地址
    25     enum url_scheme m_eScheme;    // URL 类型
    26 
    27     string    m_sHost;        // 提取出来的主机地址
    28     int    m_nPort;        // 主机端口号
    29     string    m_sPath;        //路径
    30 
    31 
    32 public:
    33     CUrl();
    34     ~CUrl();
    35 
    36     //bool ParseUrl(string strUrl);
    37 
    38     // break  an URL into scheme, host, port and request.
    39     // result as member variants
    40     bool ParseUrlEx(string strUrl);
    41 
    42     // break an URL into scheme, host, port and request.
    43     // result url as argvs
    44     void ParseUrlEx(const char *url, char *protocol, int lprotocol,
    45             char *host, int lhost,
    46             char *request, int lrequest, int *port);
    47 
    48     // get the ip address by host name
    49     char *GetIpByHost(const char *host);
    50 
    51     bool IsValidHost(const char *ip);
    52     bool IsForeignHost(string host);
    53     bool IsImageUrl(string url);
    54     bool IsValidIp(const char *ip);
    55     bool IsVisitedUrl(const char *url);
    56     bool IsUnReachedUrl(const char *url);
    57     bool IsValidHostChar(char ch);
    58 
    59 //private:
    60     void ParseScheme (const char *url);
    61 };
    62 
    // Mutex declared here; its definition is not part of the code visible here.
    // NOTE(review): pthread_mutex_t comes from <pthread.h>, which this header
    // does not include -- confirm it is pulled in before this declaration.
    63 extern pthread_mutex_t mutexMemory;
    64 
    65 #endif /* _URL_H_030728_ */
      1 /* URL handling
      2  */
      3 
      4 #include <iostream>
      5 #include <string.h>
      6 #include <sys/socket.h>
      7 #include <netdb.h>
      8 #include <map>
      9 #include "Url.h"
     10 #include <stdlib.h>
     11 #include <arpa/inet.h>
     12 
     13 //#include "Tse.h"
     14 //#include "Url.h"
     15 //#include "Http.h"
     16 //#include "Md5.h"
     17 //#include "StrFun.h"
     18 
     19 
     20 
     21 //
     /* True when the C string X is exactly ".".
      * (The earlier definition referred to an undefined identifier instead of
      * its own argument, so any use of it failed to compile.) */
     #define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
     /* True when the C string X is exactly "..".
      * The argument is parenthesised so expressions can be passed safely. */
     #define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
     26 
     // Global cache of resolved hosts: host name -> IPv4 address in text form.
     // GetIpByHost reads it before querying DNS and adds to it afterwards.
     std::map<std::string, std::string> mapCacheHostLookup;
     //extern vector<string> vsUnreachHost;
     //pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
     //extern set<string> setVisitedUrlMD5;
     //extern map<unsigned long,unsigned long> mapIpBlock;
     // Element type of mapCacheHostLookup (what insert() expects).
     typedef std::pair<const std::string, std::string> valTypeCHL;
     33 
     /* One row of the table of known URL schemes. */
     struct scheme_data
     {
         const char *leading_string;   /* prefix identifying the scheme, e.g. "http://" */
         int         default_port;     /* default port listed for the scheme */
         int         enabled;          /* non-zero: ParseScheme may report this scheme */
     };
     40 
     41 /* 支持的网页类型 */
     42 static struct scheme_data supported_schemes[] =
     43 {
     44     { "http://",  DEFAULT_HTTP_PORT,  1 },
     45     { "ftp://",   DEFAULT_FTP_PORT,   1 },
     46 
     47     /* 不合法的网页 */
     48     { NULL,       -1,                 0 }
     49 };
     50 
     51 /* 分析类型,若是合法则返回正确的,否则是不合法的  */
     52 void CUrl::ParseScheme (const char *url)
     53 {
     54     int i;
     55 
     56     for (i = 0; supported_schemes[i].leading_string; i++)
     57         if (0 == strncasecmp (url, supported_schemes[i].leading_string,strlen (supported_schemes[i].leading_string)))//比较url的前几个字母
     58         {
     59             if (supported_schemes[i].enabled)
     60             {
     61                 this->m_eScheme = (enum url_scheme) i;
     62                 return;
     63             }
     64             else
     65             {
     66                 this->m_eScheme = SCHEME_INVALID;
     67                 return;
     68             }
     69         }
     70 
     71     this->m_eScheme = SCHEME_INVALID;
     72     return;
     73 }
     74 
     75 /************************************************************************
     76  *  Function name: ParseUrlEx
     77  *  Input argv:
     78  *      -- strUrl: url
     79  *  Output argv:
     80  *      --
     81  *  Return:
     82        true: success
     83        false: fail
     84  *  Fucntion Description: break an URL into scheme, host, port and request.
     85  *              result as member variants
     86  *  Be careful:    release the memory by the client
     87 ************************************************************************/
     88 
     89 bool CUrl::ParseUrlEx(string strUrl)
     90 {
     91     char protocol[10];
     92     char host[HOST_LEN];
     93     char request[256];
     94     int port = -1;
     95 
     96     memset( protocol, 0, sizeof(protocol) );
     97     memset( host, 0, sizeof(host) );
     98     memset( request, 0, sizeof(request) );
     99 
    100     this->ParseScheme(strUrl.c_str());
    101     if( this->m_eScheme != SCHEME_HTTP )
    102     {
    103         return false;
    104     }
    105 
    106     ParseUrlEx(strUrl.c_str(),
    107             protocol, sizeof(protocol),
    108             host, sizeof(host),
    109             request, sizeof(request),
    110             &port);
    111 
    112     m_sUrl  = strUrl;
    113     m_sHost = host;
    114     m_sPath = request;
    115 
    116     if( port > 0 ){
    117         m_nPort = port;
    118     }
    119 
    120     return true;
    121 }
    122 
    123 /************************************************************************
    124  *  Function name: ParseUrlEx
    125  *  Input argv:
    126  *      -- url: host name
    127  *      -- protocol: result protocol
    128  *      -- lprotocol: protocol length
    129  *      -- host: result host
    130  *      -- lhost: host length
    131  *      -- request: result request
    132  *      -- lrequest: request length
    133  *  Output argv:
    134  *      --
    135  *  Return:
    136        true: success
    137        false: fail
    138  *  Fucntion Description: break an URL into scheme, host, port and request.
    139  *              result as argvs
    140  *  Be careful:
    141 ************************************************************************/
    142 void CUrl::ParseUrlEx(const char *url,
    143         char *protocol, int lprotocol,
    144         char *host, int lhost,
    145         char *request, int lrequest,
    146         int *port)
    147 {
    148     char *work,*ptr,*ptr2;
    149 
    150     *protocol = *host = *request = 0;
    151     *port = 80;
    152 
    153     int len = strlen(url);
    154     //pthread_mutex_lock(&mutexMemory);
    155     work = new char[len + 1];
    156     //pthread_mutex_unlock(&mutexMemory);
    157     memset(work, 0, len+1);
    158     strncpy(work, url, len);
    159     //把url的内容复制到work中
    160 
    161     // find protocol if any
    162 //在work中查找:(默认的是http)
    163     ptr = strchr(work, ':');
    164     if( ptr != NULL )
    165     {
    166         *(ptr++) = 0;
    167         strncpy( protocol, work, lprotocol );
    168     } else {
    169         strncpy( protocol, "HTTP", lprotocol );
    170         ptr = work;
    171     }
    172 
    173     // skip past opening /'s
    174 //调过 // 
    175     if( (*ptr=='/') && (*(ptr+1)=='/') )
    176         ptr+=2;
    177 
    178     // 查找主机地址
    179     ptr2 = ptr;
    180     while( IsValidHostChar(*ptr2) && *ptr2 )
    181         ptr2++;
    182     *ptr2 = 0;//保证合法的字符串
    183     strncpy( host, ptr, lhost );
    184 
    185     //查找请求的网页
    186     int offset = ptr2 - work;
    187     const char *pStr = url + offset;
    188     strncpy( request, pStr, lrequest );
    189 
    190     //找到主机的端口
    191     ptr = strchr( host, ':' );
    192     if( ptr != NULL ){
    193         *ptr = 0;
    194         *port = atoi(ptr+1);
    195     }
    196 
    197     //pthread_mutex_lock(&mutexMemory);
    198     delete [] work;
    199     //pthread_mutex_unlock(&mutexMemory);
    200     work = NULL;
    201 }
    202 
    203 
    204 
    205 
    206 
    207 
    208 /* scheme://user:pass@host[:port]... 
    209  *                    ^              
    210  * We attempt to break down the URL into the components path,
    211  * params, query, and fragment.  They are ordered like this:
    212  * scheme://host[:port][/path][;params][?query][#fragment] 
    213  */
    214 
    215 /*
    216 bool CUrl::ParseUrl(string strUrl)
    217 {
    218     string::size_type idx;
    219 
    220     this->ParseScheme(strUrl.c_str());    
    221     if( this->m_eScheme != SCHEME_HTTP )
    222         return false;
    223 
    224     // get host name
    225     this->m_sHost = strUrl.substr(7);
    226     idx = m_sHost.find('/');
    227     if(idx != string::npos){
    228         m_sHost = m_sHost.substr(0,idx);
    229     }
    230 
    231     this->m_sUrl = strUrl;
    232 
    233     return true;
    234 }
    235 */
    236 //CUrl的构造函数
    237 CUrl::CUrl()
    238 {
    239     this->m_sUrl = ""; 
    240     this->m_eScheme= SCHEME_INVALID;
    241         
    242     this->m_sHost = "";  
    243     this->m_nPort = DEFAULT_HTTP_PORT; //默认端口
    244         
    245     this->m_sPath = "";
    246     /*
    247     this->m_sParams = "";
    248     this->m_sQuery = "";
    249     this->m_sFragment = "";
    250 
    251     this->m_sDir = "";
    252     this->m_sFile = "";
    253         
    254         this->m_sUser = "";
    255     this->m_sPasswd = "";
    256     */
    257 
    258 }
    259 
    260 CUrl::~CUrl()
    261 {
    262 
    263 }
    264 
    265 
    266 /****************************************************************************
    267  *  Function name: GetIpByHost
    268  *  Input argv:
    269  *      -- host: host name
    270  *  Output argv:
    271  *      --
    272  *  Return:
    273        ip: sucess
    274        NULL: fail
    275  *  Function Description: get the ip address by host name
    276  *  Be careful: release the memory by the client
    277 ****************************************************************************/
    278 //通过主机地址获得IP地址
    279 char * CUrl::GetIpByHost(const char *host)
    280 {
    281         
    282     if( !host ){    // null pointer
    283         return NULL;
    284         cout<<"f1";
    285     }
    286 
    287     if( !IsValidHost(host) ){    // invalid host
    288         return NULL;
    289         cout<<"f2";
    290     }
    291     unsigned long inaddr = 0;
    292     char *result = NULL;
    293     int len = 0;
    294 
    295 
    296     inaddr = (unsigned long)inet_addr( host );//将字符串IP转化为32二进制的网络字节序
    297     //if ( (int)inaddr != -1){ 
    298     if ( inaddr != INADDR_NONE)
    299     { // 主机地址就是用IP地址表示的
    300         len = strlen(host);
    301         //pthread_mutex_lock(&mutexMemory);
    302         result = new char[len+1];
    303         cout<<result;
    304         //pthread_mutex_unlock(&mutexMemory);
    305         memset(result, 0, len+1);
    306         memcpy(result, host, len);
    307 
    308         return result;
    309     } 
    310     else 
    311     {
    312         //firt find from cache
    313         
    314         map<string,string>::iterator it  = mapCacheHostLookup.find(host);
    315         //可以在DNS缓存中找到
    316         if( it != mapCacheHostLookup.end() )
    317         {    //如果在cache中找到IP地址
    318             const char * strHostIp;
    319 
    320             strHostIp = (*it).second.c_str();
    321 
    322             inaddr = (unsigned long)inet_addr( strHostIp );
    323             //if ( (int)inaddr != -1){ 
    324             if ( inaddr != INADDR_NONE )
    325             { 
    326                 len = strlen(strHostIp);
    327                 //pthread_mutex_lock(&mutexMemory);
    328                 result = new char[len+1];
    329                 //pthread_mutex_unlock(&mutexMemory);
    330                 memset( result, 0, len+1 );
    331                 memcpy( result, strHostIp, len );
    332 
    333                 //cout << ":)" ;
    334                 
    335                 return result;
    336             }
    337         }
    338     }
    339 
    340     //通过上面的方法我们都没有查找,这个时候我们只能通过DNS server查找了,这种带宽的消耗是必要的!
    341     struct hostent *hp;    /* Host entity */
    342     hp = gethostbyname(host);
    343     //通过主机号或者说是域名得到hostent结构,这个结构包含主机号或者说域名的很多信息,例如我们要找的IP字符串就在其中
    344     if(hp == NULL) { 
    345         //cout << "gethostbyname() error in GetIpByHost: " << host << endl;
    346         return NULL;
    347     }
    348 
    349     // cache host lookup
    350     struct  in_addr in;
    351 
    352     bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);
    353     /*功能:将字符串src的前n个字节复制到dest中
    354      说明:bcopy不检查字符串中的空字节NULL,函数没有返回值。*/
    355         
    356     char    abuf[INET_ADDRSTRLEN];
    357     if( inet_ntop(AF_INET, (void *)&in,abuf, sizeof(abuf)) == NULL )
    358     {
    359         cout << "inet_ntop() return error in GetIpByHost" << endl;
    360         return NULL;
    361 
    362     } 
    363     else
    364     {
    365 
    366         //if( mapCacheHostLookup.count(host) == 0){
    367         if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){
    368         
    369             //cout << endl << host << " and " << abuf << endl;
    370             mapCacheHostLookup.insert( valTypeCHL ( host, abuf));
    371             //更新DNS缓存
    372             //cout<<((*mapCacheHostLookup.find("home.ustc.edu.cn")).second.c_str());
    373 
    374         }
    375 
    376     }
    377 
    378     // return result
    379     len = strlen(abuf);
    380     //pthread_mutex_lock(&mutexMemory);
    381     result = new char[len + 1];
    382     //pthread_mutex_unlock(&mutexMemory);
    383     memset( result, 0, len+1 );
    384     memcpy( result, abuf, len );
    385 
    386     return result;
    387 }
    388 
    389 /**********************************************************************************
    390  *  Function name: IsValidHostChar
    391  *  Input argv:
    392  *      -- ch: the character for testing
    393  *  Output argv:
    394  *      -- 
    395  *  Return:
    396        true: is valid
    397        false: is invalid
    398  *  Function Description: test the specified character valid
    399  *              for a host name, i.e. A-Z or 0-9 or -.:
    400 **********************************************************************************/
    401 bool CUrl::IsValidHostChar(char ch)
    402 {
    403     return( isalpha(ch) || isdigit(ch)
    404         || ch=='-' || ch=='.' || ch==':' || ch=='_');
    405 }
    406 
    407 /**********************************************************************************
    408  *  Function name: IsValidHost
    409  *  Input argv:
    410  *      -- ch: the character for testing
    411  *  Output argv:
    412  *      -- 
    413  *  Return:
    414        true: is valid
    415        false: is invalid
    416  *  Function Description: test the specified character valid
    417  *              for a host name, i.e. A-Z or 0-9 or -.:
    418  *  Be careful:
    419 **********************************************************************************/
    420 bool CUrl::IsValidHost(const char *host)
    421 {
    422     if( !host ){//空的主机号,我们认为是无效的主机号
    423         return false;
    424     }
    425 
    426     if( strlen(host) < 6 ){ //主机号长度小于6,我们认为ieshi无效的主机号
    427         return false;
    428     }
    429 
    430     char ch;
    431     for(unsigned int i=0; i<strlen(host); i++){
    432         ch = *(host++);
    433         if( !IsValidHostChar(ch) ){
    434             return false;
    435         }
    436     }
    437 
    438     return true;
    439 }
    440 
    441 /**********************************************************************************
    442  *  Function name: IsVisitedUrl
    443  *  Input argv:
    444  *      -- url: url
    445  *  Output argv:
    446  *      -- 
    447  *  Return:
    448        true: is visited
    449        false: not visited
    450  *  Function Description: test the url visited by the MD5
    451  *  Be careful:
    452 **********************************************************************************/
    453 bool CUrl::IsVisitedUrl(const char *url)//判断该URL是否访问过
    454 {
    455     if( !url ){
    456         return true; // if be null, we think it have been visited
    457     }
    458 
    459     CMD5 iMD5;
    460     iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );
    461     string strDigest = iMD5.ToString();
    462 
    463     if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) {
    464         return true;
    465     } else {
    466         return false;
    467     }
    468 
    469 }
    470 
    471 
    472 /**********************************************************************************
    473  *  Function name: IsValidIp
    474  *  Input argv:
    475  *      -- ip: ip
    476  *  Output argv:
    477  *      -- 
    478  *  Return:
    479        true: inside the ip block
    480        false: outside the ip block
    481  *  Function Description: decide teh ip whether or not inside the ip block
    482  *  Be careful:
    483 **********************************************************************************/
    484 bool CUrl::IsValidIp(const char *ip)
    485 {
    486     if( ip == NULL )
    487     {
    488         return false;
    489     }
    490 
    491     unsigned long inaddr = (unsigned long)inet_addr(ip);
    492     if( inaddr == INADDR_NONE ){//显然该IP参数不是正确的字符串IP
    493         return false;
    494     }
    495 
    496     if (mapIpBlock.size() > 0) { //判断是否要过滤掉
    497         map<unsigned long, unsigned long>::iterator pos;
    498         for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
    499             unsigned long ret;
    500 
    501             ret = inaddr & ~((*pos).second);
    502             if (ret == (*pos).first) { // inside
    503                 return true;
    504             }
    505         }
    506 
    507         // outside
    508         return false;
    509     }
    510 
    511 
    512     // if block range is not given, we think it inside also
    513     return true;
    514 }
    515 /*
    516  * If it is, return true; otherwise false
    517  * not very precise
    518  */
    519 bool CUrl::IsForeignHost(string host)
    520 {
    521     if( host.empty() ) return true;
    522     if( host.size() > HOST_LEN ) return true;
    523 
    524     unsigned long inaddr = 0;
    525 
    526     inaddr = (unsigned long)inet_addr( host.c_str() );
    527     if ( inaddr != INADDR_NONE){ // host is just ip
    528         return false;
    529     }
    530 
    531     string::size_type idx = host.rfind('.');
    532     string tmp;
    533     if( idx != string::npos ){
    534         tmp = host.substr(idx+1);
    535     }
    536 
    537     CStrFun::Str2Lower( tmp, tmp.size() );
    538     const char *home_host[] ={
    539         "cn","com","net","org","info",
    540         "biz","tv","cc", "hk", "tw"
    541     };
    542 
    543     int home_host_num = 10;
    544 
    545     for(int i=0; i<home_host_num; i++){
    546         if( tmp == home_host[i] )
    547             return false;
    548     }
    549 
    550     return true;
    551 }
    552     
    553     
    554 bool CUrl::IsImageUrl(string url)
    555 {
    556     if( url.empty() ) return false;
    557     if( url.size() > HOST_LEN ) return false;
    558 
    559     string::size_type idx = url.rfind('.');
    560     string tmp;
    561     if( idx != string::npos ){
    562         tmp = url.substr(idx+1);
    563     }
    564 
    565     CStrFun::Str2Lower( tmp, tmp.size() );
    566     const char *image_type[] ={
    567         "gif","jpg","jpeg","png","bmp",
    568         "tif","psd"
    569     };
    570 
    571     int image_type_num = 7;
    572 
    573     for (int i=0; i<image_type_num; i++)
    574     {
    575         if( tmp == image_type[i] )
    576             return true;
    577     }
    578 
    579     return false;
    580 }
  • 相关阅读:
    内存池(MemPool)技术详解
    关于项目时间管理的六点须知
    如何与你的老大沟通?
    一个简单的面试题
    Windows下的Memcache安装与测试教程
    反向代理服务器的工作原理
    Linux下的Memcache安装方法
    TCP/IP协议详解
    浅谈负载均衡技术与分类
    MySQL数据备份和恢复的方法大全
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2578412.html
Copyright © 2011-2022 走看看