zoukankan      html  css  js  c++  java
  • CHttp

     1 #ifndef _HTTP_H_031105_
     2 #define _HTTP_H_031105_
     3 
     4 #include <map>
     5 
     6 using namespace std;
     7 
     8 class CHttp
     9 {
    10 private:
    11     string m_strUrl;    // url
    12     int *m_sock;        // socket
    13 
    14 public:
    15     CHttp();
    16     virtual ~CHttp();
    17 
    18 
    19     //strUrl:  待抓取的网页对应的URL
    20     //fileBuf: 网页体信息
    21     //fileHead:网页头信息
    22     //location:网页如果重定向对应的URL
    23     //sock:套接子文件描述符
    24     int Fetch(string strUrl, char **fileBuf, 
    25         char **fileHead, char **location, int* sock);
    26 
    27 private:
    28     //下面4个私有的成员函数--被 Fetch()函数调用
    29     //通过IO复用的方法读取网页头信息
    30     int read_header(int sock, char *headerPtr);
    31     
    32     //创建套接字文件描述符
    33     int CreateSocket(const char *host, int port);
    34 
    35     //被CreateSocket()调用,通过IO复用的方法连接目标服务器
    36     int nonb_connect(int, struct sockaddr*, int);
    37 
    38     //检测*buf所指的内存空间剩余值是否大于more,不够再加more+1单位的内存空间
    39     int checkBufSize(char **buf, int *bufsize, int more);
    40 
    41 };
    42 
    43 extern pthread_mutex_t mutexMemory;
    44 
    45 #endif /* _HTTP_H_031105_ */
       1 #include <stdlib.h>
       2 #include <stdio.h>
       3 #include <string.h>
       4 #include <strings.h>
       5 #include <errno.h>
       6 #include <netdb.h>
       7 #include <unistd.h>
       8 #include <netinet/in.h>
       9 #include <sys/types.h>
      10 #include <sys/socket.h>
      11 #include <sys/time.h>
      12 #include <fcntl.h>
      13 #include <iostream>
      14 #include "Http.h"
      15 
      16 //#include "Tse.h"
      17 #include "CommonDef.h"
      18 #include "Url.h"
      19 //#include "Page.h"
      20 #include "StrFun.h"
      21 
      22 char *userAgent = NULL;
      23 int timeout = DEFAULT_TIMEOUT;//设置最长的等待时间30秒
      24 int hideUserAgent = 0;
      25 
      26 CHttp::CHttp()
      27 {
      28 }
      29 
      30 CHttp::~CHttp()
      31 {
      32 }
      33 
      34 
      35     /*
      36          * Actually downloads the page, registering a hit (donation)
      37          *      If the fileBuf passed in is NULL, the url is downloaded and then
      38          *      freed; otherwise the necessary space is allocated for fileBuf.
      39          *      Returns size of download on success, 
      40             -1 on error,
      41              -2 out of ip block,
      42              -3 invalid host,
      43             -4 MIME is image/xxx,
      44              -300 on 301 or 302 (redirect).
      45          */
      46 
      47 
      48 
      49 /*
      50 
      51 function:
      52 
      53 success: return bytesRead[网页体信息的真实的字节数]
      54 
      55 fail:    return -1  各种其他的错误
      56 
      57          return -2  在IP阻塞范围内
      58 
      59          return -3  无效的主机号
      60 
      61          return -4  image/text类型
      62 
      63          return -300 网页重定向
      64 
      65 strUrl:  待抓取的网页对应的URL
      66 
      67 fileBuf: 网页体信息
      68 
      69 fileHead:网页头信息
      70 
      71 location:网页如果重定向对应的URL
      72 
      73 sock:套接子文件描述符
      74 
      75 */
      76 int CHttp::Fetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock )
      77 {
      78     char *tmp, *url, *requestBuf, *pageBuf;
      79     const char *host, *path;
      80     int sock, bytesRead = 0, bufsize = REQUEST_BUF_SIZE;
      81     int ret = -1, tempSize, selectRet;
      82     int port = 80;
      83 
      84 
      85     if( strUrl.empty() )//空的URL肯定不能抓取到网页
      86     {
      87         cout << "strUrl is NULL" << endl;
      88         return -1;
      89     }
      90 
      91     /* Copy the url passed in into a buffer we can work with, change, etc. */
      92 /*
      93     url = (char*)malloc(strUrl.length()+1);
      94     if( url == NULL ){
      95         cout << "can not allocate enought memory for url" << endl;
      96         return -1;
      97     } else {
      98         memset(url, 0,strUrl.length()+1);
      99         memcpy(url, strUrl.c_str(), strUrl.length() );
     100     }
     101 */
     102     //pthread_mutex_lock(&mutexMemory);
     103     url = strdup(strUrl.c_str());//复制url
     104     //pthread_mutex_unlock(&mutexMemory);
     105     if( url == NULL )//分配失败
     106     {
     107         cout << "!error: stdup() in Fetch()" << endl;
     108         return -1;
     109     }
     110 
     111     // parse the url
     112     CUrl u;
     113     if( u.ParseUrlEx(url) == false )
     114     {
     115         //如果没有"http://"协议号,肯定会解析错误
     116         cout << "ParseUrlEx error in Fetch(): " << strUrl << endl;
     117         return -1;
     118     }
     119 
     120     host = u.m_sHost.c_str();
     121     path = u.m_sPath.c_str();
     122     if( u.m_nPort > 0 ) port = u.m_nPort;
     123 
     124     /* Compose a request string */
     125     //pthread_mutex_lock(&mutexMemory);
     126 
     127     /*构造HTTP请求报文:  假设strUrl="http://www.baidu.com/ecjtu/nihao.html"*/
     128     // GET /ecjtu/nihao.html HTTP/1.0\r\n
     129     requestBuf = (char*)malloc(bufsize);
     130     //pthread_mutex_unlock(&mutexMemory);
     131     if(requestBuf == NULL)
     132     {
     133         if (url)
     134         {
     135             //pthread_mutex_lock(&mutexMemory);
     136             free(url);
     137             url=NULL;
     138             //pthread_mutex_unlock(&mutexMemory);
     139         }
     140         cout << "can not allocate enought memory for requestBuf" << endl;
     141         return -1;
     142     }
     143     requestBuf[0] = 0;
     144 
     145     if( strlen(path) < 1 )//说明请求的是根目录下的网页
     146     {
     147         // GET / HTTP/1.0\r\n
     148         /* The url has no '/' in it, assume the user is making a root-level
     149                  *      request */
     150         tempSize = strlen("GET /") + strlen(HTTP_VERSION) +2;
     151 /*
     152         if( tempSize > bufsize ){
     153             free(url);
     154             free(requestBuf);
     155             cout << "tempSize larger than bufsize" << endl;
     156             return -1;
     157         }
     158 */
     159 
     160         if(checkBufSize(&requestBuf, &bufsize, tempSize) ||    snprintf(requestBuf, bufsize, "GET / %s\r\n", HTTP_VERSION) < 0 ){
     161             /*int snprintf(char *restrict buf, size_t n, const char * restrict  format, ...);
     162              函数说明:最多从源串中拷贝n-1个字符到目标串中,然后再在后面加一个0。所以如果目标串的大小为n
     163              的话,将不会溢出。*/
     164 
     165             //pthread_mutex_lock(&mutexMemory);
     166             if (url)
     167             {
     168                  free(url); 
     169                  url=NULL;
     170             }
     171             if (requestBuf)
     172             {
     173                  free(requestBuf); 
     174                  requestBuf=NULL;
     175             }
     176             //pthread_mutex_unlock(&mutexMemory);
     177             cout << "1.checkBuffSize(&requestBuf..) error" << endl;
     178             return -1;
     179         }
     180 
     181     }
     182     else//说明请求的是非根目录下的网页
     183     {
     184         tempSize = strlen("GET ") + strlen(path) + strlen(HTTP_VERSION) + 4;
     185 
     186         if(checkBufSize(&requestBuf, &bufsize, tempSize) ||    snprintf(requestBuf, bufsize, "GET %s %s\r\n", path, HTTP_VERSION) < 0)
     187         {
     188 
     189             //pthread_mutex_lock(&mutexMemory);
     190             if (url)
     191             {
     192                  free(url); 
     193                  url=NULL;
     194             }
     195             if (requestBuf)
     196             {
     197                  free(requestBuf); 
     198                  requestBuf=NULL;
     199             }
     200             //pthread_mutex_unlock(&mutexMemory);
     201             cout << "2._checkBuffSize(&requestBuf..) error" << endl;
     202             return -1;
     203         }
     204 
     205     }
     206 
     207 
     208     /* Use Host: even though 1.0 doesn't specify it.  Some servers
     209          *      won't play nice if we don't send Host, and it shouldn't hurt anything */
     210     tempSize = (int)strlen("Host: ") + (int)strlen(host) + 3;/* +3 for "\r\n\0" */
     211 
     212     if(checkBufSize(&requestBuf, &bufsize, tempSize + 128)){
     213         //pthread_mutex_lock(&mutexMemory);
     214         if (url)
     215         {
     216              free(url); url=NULL;
     217         }
     218         if (requestBuf)
     219         {
     220              free(requestBuf); requestBuf=NULL;
     221         }
     222         //pthread_mutex_unlock(&mutexMemory);
     223         cout << "3._checkBuffSize(&requestBuf..) error" << endl;
     224         return -1;
     225     }
     226 
     227     strcat(requestBuf, "Host: ");
     228     strcat(requestBuf, host);
     229     strcat(requestBuf, "\r\n");
     230 
     231     if(!hideUserAgent && userAgent == NULL) {
     232 
     233         tempSize = (int)strlen("User-Agent: ") +
     234             (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 4;
     235         if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
     236             //pthread_mutex_lock(&mutexMemory);
     237             if (url)
     238             {
     239                  free(url); url=NULL;
     240             }
     241             if (requestBuf)
     242             {
     243                  free(requestBuf); requestBuf=NULL;
     244             }
     245             //pthread_mutex_unlock(&mutexMemory);
     246             cout << "4._checkBuffSize(&requestBuf..) error" << endl;
     247             return -1;
     248         }
     249         strcat(requestBuf, "User-Agent: ");
     250         strcat(requestBuf, DEFAULT_USER_AGENT);
     251         strcat(requestBuf, "/");
     252         strcat(requestBuf, VERSION);
     253         strcat(requestBuf, "\r\n");
     254 
     255     } else if(!hideUserAgent) {
     256 
     257         tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 3;
     258         if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
     259 
     260             //pthread_mutex_lock(&mutexMemory);
     261             if (url)
     262             {
     263                  free(url); url=NULL;
     264             }
     265             if (requestBuf)
     266             {
     267                  free(requestBuf); requestBuf=NULL;
     268             }
     269             //pthread_mutex_unlock(&mutexMemory);
     270             cout << "5._checkBuffSize(&requestBuf..) error" << endl;
     271             return -1;
     272         }
     273         strcat(requestBuf, "User-Agent: ");
     274         strcat(requestBuf, userAgent);
     275         strcat(requestBuf, "\r\n");
     276     }
     277 
     278     //tempSize = (int)strlen("Connection: Close\n\n");
     279     tempSize = (int)strlen("Connection: Keep-Alive\r\n\r\n");
     280     if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
     281         //pthread_mutex_lock(&mutexMemory);
     282         if (url)
     283         {
     284              free(url); url=NULL;
     285         }
     286         if (requestBuf)
     287         {
     288              free(requestBuf); requestBuf=NULL;
     289         }
     290         //pthread_mutex_unlock(&mutexMemory);
     291         cout << "6._checkBuffSize(&requestBuf..) error" << endl;
     292         return -1;
     293     }
     294 
     295 
     296     //strcat(requestBuf, "Connection: Close\n\n");
     297     strcat(requestBuf, "Connection: Keep-Alive\r\n\r\n");
     298 
     299 
     300     /* Now free any excess memory allocated to the buffer */
     301     //pthread_mutex_lock(&mutexMemory);
     302     //重新调整requestBuf的内存空间,释放多余的内存空间
     303     tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1);
     304     //pthread_mutex_unlock(&mutexMemory);
     305     if(tmp == NULL){
     306         //pthread_mutex_lock(&mutexMemory);
     307         if (url)
     308         {
     309              free(url); url=NULL;
     310         }
     311         if (requestBuf)
     312         {
     313              free(requestBuf); requestBuf=NULL;
     314         }
     315         //pthread_mutex_unlock(&mutexMemory);
     316         cout << "realloc for tmp error" << endl;
     317         return -1;
     318     }
     319     requestBuf = tmp;
     320 
     321     if( *nPSock != -1 ){
     322         sock = *nPSock;
     323         cout << "using privous socket " << *nPSock << endl;
     324     }else{
     325 
     326         // cout << "1.get a new one" << endl;
     327         sock = CreateSocket( host, port );
     328         if(sock == -1) { // invalid host
     329             //pthread_mutex_lock(&mutexMemory);
     330             if (url)
     331             {
     332                  free(url); url=NULL;
     333             }
     334             if (requestBuf)
     335             {
     336                  free(requestBuf); requestBuf=NULL;
     337             }
     338             //pthread_mutex_unlock(&mutexMemory);
     339             return -3;
     340         }
     341         if(sock == -2) { // out of ip block
     342             //pthread_mutex_lock(&mutexMemory);
     343             if (url)
     344             {
     345                  free(url); url=NULL;
     346             }
     347             if (requestBuf)
     348             {
     349                  free(requestBuf); requestBuf=NULL;
     350             }
     351             //pthread_mutex_unlock(&mutexMemory);
     352             //cout << "2.not able to MakeSocket" << endl;
     353             return -2;
     354         }
     355     }
     356     
     357 
     358 
     359     ret = write(sock, requestBuf, strlen(requestBuf));
     360     if( ret == 0 ){
     361         cout << "requestBuf is " << requestBuf << endl;
     362         cout << "write nothing" << endl;
     363         //pthread_mutex_lock(&mutexMemory);
     364         if (url)
     365         {
     366             free(url); url=NULL;
     367         }
     368         if (requestBuf)
     369         {
     370             free(requestBuf); requestBuf=NULL;
     371         }
     372         //pthread_mutex_unlock(&mutexMemory);
     373         close(sock);
     374         *nPSock = -1;
     375         return -1;
     376         
     377     }
     378     if( ret == -1){
     379         //cout << "write error" << endl;
     380         // sock is invalid,we should make a new one
     381         close(sock);
     382         *nPSock  = -1;
     383 
     384         cout << "2.close previous socket " << *nPSock << " and get a new one" << endl;
     385         //maybe sock is dead,try again
     386         sock = CreateSocket( host, port );
     387         if(sock == -1) { 
     388             //pthread_mutex_lock(&mutexMemory);
     389             if (url)
     390             {
     391                 free(url); url=NULL;
     392             }
     393             if (requestBuf)
     394             {
     395                 free(requestBuf); requestBuf=NULL;
     396             }
     397             //pthread_mutex_unlock(&mutexMemory);
     398             cout << "3.not able to MakeSocket" << endl;
     399             return -1;
     400         }
     401         if(sock == -2) { 
     402             //pthread_mutex_lock(&mutexMemory);
     403             if (url)
     404             {
     405                 free(url); url=NULL;
     406             }
     407             if (requestBuf)
     408             {
     409                 free(requestBuf); requestBuf=NULL;
     410             }
     411             //pthread_mutex_unlock(&mutexMemory);
     412             cout << "4.not able to MakeSocket" << endl;
     413             return -1;
     414         }
     415         if(write(sock, requestBuf, strlen(requestBuf)) == -1){
     416             //pthread_mutex_lock(&mutexMemory);
     417             if (url)
     418             {
     419                 free(url); url=NULL;
     420             }
     421             if (requestBuf)
     422             {
     423                 free(requestBuf); requestBuf=NULL;
     424             }
     425             //pthread_mutex_unlock(&mutexMemory);
     426             close(sock);
     427             *nPSock = -1;
     428             cout << "write error" << endl;
     429             return -1;
     430         }
     431     }
     432 
     433     //pthread_mutex_lock(&mutexMemory);
     434     if (url)
     435     {
     436         free(url); url=NULL;
     437     }
     438     if (requestBuf)
     439     {
     440         free(requestBuf); requestBuf=NULL;
     441     }
     442     //pthread_mutex_unlock(&mutexMemory);
     443 
     444 
     445     char headerBuf[HEADER_BUF_SIZE];
     446     /* Grab enough of the response to get the metadata */
     447     memset( headerBuf,0,HEADER_BUF_SIZE );
     448     //cout << "old sock is " << sock << endl;
     449     ret = read_header(sock, headerBuf);
     450     //cout << "ret = " << ret << endl;
     451     if(ret < 0) { 
     452         close(sock); 
     453         *nPSock = -1;
     454         return -1;
     455     }
     456 
     457     //cout << headerBuf << endl;
     458     if( strlen(headerBuf) == 0 ){
     459         cout << "strlen(headerBuf) = 0" << headerBuf << endl;
     460         cout << "strUrl: " << strUrl << endl << endl;;
     461         close(sock);
     462                 *nPSock = -1;
     463         return -1;
     464     }
     465 
     466 
     467 
     468      //解析网页头信息
     469     CPage iPage;
     470     iPage.ParseHeaderInfo(headerBuf);
     471     if (iPage.m_nStatusCode == -1)
     472     {
     473         close(sock);
     474         *nPSock = -1;
     475         cout << "headerBuf: " << headerBuf << endl;
     476         cout << "!header error: not find HTTP" << endl;
     477         return -1;
     478     }
     479 
     480 
     481 
     482     // deal with http://net.cs.pku.edu.cn/~cnds
     483     if (iPage.m_nStatusCode == 301 || iPage.m_nStatusCode == 302)
     484     {
     485         if (iPage.m_sLocation.empty() || iPage.m_sLocation.size()>URL_LEN)
     486         {    
     487             close(sock);
     488             *nPSock = -1;
     489             cout << headerBuf << endl;
     490             cout << "!error: Location" << endl;
     491             return -1;
     492         }
     493         else
     494         {
     495             //pthread_mutex_lock(&mutexMemory);
     496             char *loc=strdup(iPage.m_sLocation.c_str());
     497             //pthread_mutex_unlock(&mutexMemory);
     498             *location = loc;
     499             close(sock);
     500             *nPSock = -1;
     501             return -300;//重定向了
     502         }
     503     }
     504 
     505     if(iPage.m_nStatusCode<200 || iPage.m_nStatusCode>299 ){
     506         close(sock);
     507         *nPSock = -1;
     508         cout << "!header code = " << iPage.m_nStatusCode << endl;
     509         return -1;
     510     }
     511 
     512     // when crawling images for ImgSE, remember to comment the paragraph
     513     // when crawling plain text for SE, remember to open the paragraph
     514     // paragraph begin
     515     if( iPage.m_sContentType.find("image") != string::npos )
     516     { // 
     517         close(sock);
     518         *nPSock = -1;
     519         return -4;
     520     }
     521     // paragraph end
     522 
     523     if (iPage.m_nContentLength == -1)
     524     {
     525         close(sock);
     526         *nPSock = -1;
     527         cout << headerBuf << endl;
     528         cout << "!error: Content-length" << endl;
     529         return -1;
     530     }
     531 
     532     if (iPage.m_nContentLength==0 || iPage.m_nContentLength<20)
     533     { // Allocate enough memory to hold the page 
     534         iPage.m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
     535     }
     536 
     537 
     538     if (iPage.m_nContentLength > MAX_PAGE_BUF_SIZE)
     539     {
     540          cout<<"这个网页的长度大于5M,我过滤掉它!"<<endl;
     541         cout << "the page discarded due to its size " 
     542             << iPage.m_nContentLength 
     543             << " is larger than " << MAX_PAGE_BUF_SIZE << endl;
     544         close(sock);
     545         *nPSock = -1;
     546         return -1;
     547     }
     548 
     549     //pthread_mutex_lock(&mutexMemory);
     550     pageBuf = (char *)malloc(iPage.m_nContentLength);
     551     //pthread_mutex_unlock(&mutexMemory);
     552     if(pageBuf == NULL){
     553         close(sock);
     554         *nPSock = -1;
     555         cout << "malloc for pageBuf" << endl;
     556         return -1;
     557     }
     558     
     559     /* Begin reading the body of the file */
     560     //开始读取网页体信息
     561     fd_set rfds;
     562     struct timeval tv;
     563     int flags;
     564     //将sock套接子文件描述符设置为非阻塞的方式
     565     flags=fcntl(sock,F_GETFL,0);
     566     if(flags<0)
     567     {
     568         close(sock);
     569         *nPSock = -1;
     570         if (pageBuf)
     571         {
     572             //pthread_mutex_lock(&mutexMemory);
     573             free(pageBuf);
     574             pageBuf=NULL;
     575             //pthread_mutex_unlock(&mutexMemory);
     576         }
     577         cout << "1.fcntl() error " << endl;
     578         return -1;
     579     }
     580     
     581     
     582     flags|=O_NONBLOCK;
     583     if(fcntl(sock,F_SETFL,flags)<0){
     584         close(sock);
     585         *nPSock = -1;
     586         if (pageBuf)
     587         {
     588             free(pageBuf); pageBuf=NULL;
     589         }
     590         cout << "2.fcntl() error " << endl;
     591         return -1;
     592     }
     593 
     594 
     595     //挂一个while()循环读取网页体信息
     596     int pre_ret=0;
     597     while(ret > 0)
     598     {
     599         FD_ZERO(&rfds);//清理rfds读文件描述符集合
     600         FD_SET(sock, &rfds);//将sock加到rfds读文件描述符集合中
     601         if( bytesRead == iPage.m_nContentLength )
     602         {
     603             tv.tv_sec = 1;
     604         }
     605         else
     606         {
     607             tv.tv_sec = timeout;
     608         }
     609         tv.tv_usec = 0;
     610 
     611         if(DEFAULT_TIMEOUT >= 0)
     612             selectRet = select(sock+1, &rfds, NULL, NULL, &tv);//IO复用
     613         else            /* No timeout, can block indefinately */
     614             selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
     615 
     616         if(selectRet == 0 && timeout < 0)//超时
     617         {
     618             close(sock);
     619             *nPSock = -1;
     620             if (pageBuf)
     621             {
     622                 //pthread_mutex_lock(&mutexMemory);
     623                 free(pageBuf);
     624                 pageBuf=NULL;
     625                 //pthread_mutex_unlock(&mutexMemory);
     626             }
     627             cout << "selectRet == 0 && timeout < 0" << endl;
     628             return -1;
     629         } 
     630         else if(selectRet == -1)//select()函数出错
     631         {
     632             close(sock);
     633             *nPSock = -1;
     634             if (pageBuf)
     635             {
     636                 free(pageBuf);
     637                 pageBuf=NULL;
     638             }
     639             cout << "selectRet == -1" << endl;
     640             return -1;
     641         }
     642 
     643         //每次最多接收iPage.m_nContentLength字节--缓冲区的大小为iPage.m_nContentLength
     644         ret = read(sock, pageBuf + bytesRead, iPage.m_nContentLength);
     645         //ret = read(sock, (char*)pageBuf.c_str() + bytesRead, iPage.m_nContentLength);
     646 
     647         if(ret == 0) break;
     648         if(ret == -1 && pre_ret==0)//read()函数出错
     649         {
     650             close(sock);
     651             *nPSock = -1;
     652             if (pageBuf)
     653             {
     654                 //pthread_mutex_lock(&mutexMemory);
     655                 free(pageBuf); pageBuf=NULL;
     656                 //pthread_mutex_unlock(&mutexMemory);
     657             }
     658             cout << "read()'s retval=-1" << endl;
     659             return -1;
     660         }
     661         else if( ret == -1 && pre_ret )
     662         {
     663             //cout << "2. pre_ret = " << pre_ret << endl;
     664 /*
     665             if( bytesRead < iPage.m_nContentLength){    // meaning we lost the connection too soon
     666                 cout << "lost the connection too soon" << endl;
     667                 freeOpageBuf);
     668                 return -1;
     669             }
     670 */
     671             break;
     672         }
     673 
     674         pre_ret = ret;
     675         //cout << "1.pre_ret = " << pre_ret << endl;
     676 
     677         bytesRead += ret;
     678 
     679 
     680             /* To be tolerant of inaccurate Content-Length fields, we'll
     681              *      allocate another read-sized chunk to make sure we have
     682              *      enough room.
     683              */
     684         if(ret > 0) {
     685             //pthread_mutex_lock(&mutexMemory);
     686             pageBuf = (char *)realloc(pageBuf, bytesRead + iPage.m_nContentLength);
     687             //pthread_mutex_unlock(&mutexMemory);
     688             if(pageBuf == NULL) {
     689                 close(sock);
     690                 *nPSock = -1;
     691                 if (pageBuf)
     692                 {
     693                     //pthread_mutex_lock(&mutexMemory);
     694                     free(pageBuf); pageBuf=NULL;
     695                     //pthread_mutex_unlock(&mutexMemory);
     696                 }
     697                 cout << "realloc()" << endl;
     698                 return -1;
     699             }
     700         }
     701 
     702     }
     703 
     704     /*
     705      * The download buffer is too large.  Trim off the safety padding.
     706     */
     707 
     708     //pthread_mutex_lock(&mutexMemory);
     709     pageBuf = (char *)realloc(pageBuf, bytesRead+1);
     710     //pthread_mutex_unlock(&mutexMemory);
     711     if(pageBuf == NULL){
     712         close(sock);
     713         *nPSock = -1;
     714         if (pageBuf)
     715         {
     716             //pthread_mutex_lock(&mutexMemory);
     717             free(pageBuf); pageBuf=NULL;
     718             //pthread_mutex_unlock(&mutexMemory);
     719         }
     720         cout << "2.realloc()" << endl;
     721         return -1;
     722     }
     723 
     724 
     725     pageBuf[bytesRead] = '\0';
     726 
     727 
     728     if(fileBuf == NULL){    /* They just wanted us to "hit" the url */
     729         if (pageBuf)
     730         {
     731             //pthread_mutex_lock(&mutexMemory);
     732             free(pageBuf); pageBuf=NULL;
     733             //pthread_mutex_unlock(&mutexMemory);
     734         }
     735     }else{
     736 
     737 
     738 
     739         char *tmp;
     740         //tmp = (char *)malloc(HEADER_BUF_SIZE);
     741         //pthread_mutex_lock(&mutexMemory);
     742         tmp = (char *)malloc(strlen(headerBuf)+1);
     743         //pthread_mutex_unlock(&mutexMemory);
     744             if(tmp == NULL){
     745                     close(sock);
     746             *nPSock = -1;
     747             if (pageBuf)
     748             {
     749                 //pthread_mutex_lock(&mutexMemory);
     750                 free(pageBuf); pageBuf=NULL;
     751                 //pthread_mutex_unlock(&mutexMemory);
     752             }
     753             cout << "malloc() for headerBuf" << endl;
     754                     return -1;
     755             }
     756         //memcpy( tmp, headerBuf, HEADER_BUF_SIZE-1 );
     757         strncpy( tmp, headerBuf, strlen(headerBuf)+1 );
     758         *fileHeadBuf = tmp;
     759 
     760         *fileBuf = pageBuf;
     761     }
     762         
     763     //close(sock);
     764     *nPSock = sock;
     765     return bytesRead;
     766 }
     767     
     768 
     769 
     770 
     771 
     772 /*
     773 
     774 function: 创建套接字文件描述符,并且调用nonb_connect()同目标服务器进行连接
     775 
     776 success:  return sock[成功创建的套接子文件描述符]
     777 
     778 fail:     return -1  其他错误
     779 
     780           return -2  在IP阻塞范围内
     781 
     782 */
     783 int CHttp::CreateSocket(const char *host, int port)
     784 {
     785     int sock;        // Socket descriptor
     786     struct sockaddr_in sa;    // Socket address
     787 
     788 
     789     unsigned long   inaddr;
     790     int ret;
     791 
     792     CUrl url;
     793     char *ip = url.GetIpByHost(host);//通过主机号得到IP地址
     794 
     795     if( ip == NULL )//获得失败
     796     { // gethostbyname() error in GetIpByHost()
     797         //cout << "invalid host: " << host << endl;
     798         return -1;
     799 
     800     } 
     801     else 
     802     {
     803         // filter ip (decide whether it is inside the ip block)
     804         if( url.IsValidIp(ip) )//在IP阻塞范围内
     805         {
     806             // inside
     807             inaddr = (unsigned long)inet_addr(ip);//将字符串IP转化为32位的网络字节序
     808 
     809             if( inaddr == INADDR_NONE )
     810             {
     811                 // release the buffer, be careful
     812                 //pthread_mutex_lock(&mutexMemory);
     813                 delete [] ip; ip = NULL;
     814                 //pthread_mutex_unlock(&mutexMemory);
     815                 cout << "invalid ip " << ip << endl;
     816                 return -1;
     817             }
     818 
     819             memcpy((char *)&sa.sin_addr, (char *)&inaddr, sizeof(inaddr));
     820 
     821             // release the buffer, be carful
     822             //pthread_mutex_lock(&mutexMemory);
     823             delete [] ip; ip = NULL;
     824             //pthread_mutex_unlock(&mutexMemory);
     825 
     826         } 
     827         else//在IP阻塞范围外
     828         { // out of ip block
     829             // release the buffer, be carful
     830             //pthread_mutex_lock(&mutexMemory);
     831             delete [] ip; ip = NULL;
     832             //pthread_mutex_unlock(&mutexMemory);
     833             //cout << "out of ip block: " << host << endl;
     834             return -2;
     835         }
     836     }
     837 
     838 
     839     /* Copy host address from hostent to (server) socket address */
     840     sa.sin_family = AF_INET;        
     841     sa.sin_port = htons(port);    /* Put portnum into sockaddr */
     842 
     843     sock = -1;
     844     sock = socket(AF_INET, SOCK_STREAM, 0);//创建套接字文件描述符
     845     if(sock < 0 ) //创建失败
     846     { 
     847         cout << "socket() in CreateSocket" << endl;
     848         return -1;
     849     }
     850 
     851     int optval = 1;
     852     if (setsockopt (sock, SOL_SOCKET, SO_REUSEADDR,(char *)&optval, sizeof (optval)) < 0)
     853         //SOL_SOCKET 通用套接字选项
     854         //SO_REUSEADDR 表示允许本地地址重用
     855     {
     856 
     857         cout << "setsockopt() in CreateSocket" << endl;
     858         close(sock);
     859         return -1;
     860     }
     861 
     862         //ret = connect(sock, (struct sockaddr *)&sa, sizeof(sa));
     863         ret = nonb_connect(sock, (struct sockaddr *)&sa, DEFAULT_TIMEOUT);
     864         if(ret == -1) { 
     865         cout << "nonb_connect() in CreateSocket" << endl;
     866         close(sock);
     867         return -1; 
     868     }
     869 
     870         return sock;//返回套接字文件描述符
     871 }
     872 
     873 
     874 /* function:通过IO复用的方法读取网页头信息
     875  success: return  bytesRead[网页头信息的真实长度]
     876  fail:    return  -1
     877  */
     878 int CHttp::read_header(int sock, char *headerPtr)
     879 {
     880     fd_set rfds;//读文件描述符集合
     881     struct timeval tv;
     882     int bytesRead = 0, newlines = 0, ret, selectRet;
     883 
     884     int flags;
     885 
     886     flags=fcntl(sock,F_GETFL,0);//将sock套接子文件描述符设置为非阻塞方式
     887     if(flags<0)
     888     {
     889         cout << "1.fcntl() in read_header()< 0" << endl;
     890         return -1;
     891     }
     892     
     893     flags|=O_NONBLOCK;
     894     if(fcntl(sock,F_SETFL,flags)<0)
     895     {
     896         cout << "2.fcntl() < 0 in read_header()" << endl;
     897         return -1;
     898     }
     899 
     900     //挂一个while()循环来读取网页头信息
     901     while(newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
     902     {
     903         FD_ZERO(&rfds);//清理读文件描述符集合
     904         FD_SET(sock, &rfds);//将套接字文件描述符加到读文件描述符集合中
     905         tv.tv_sec = timeout;//设置最长的等待时间
     906         tv.tv_usec = 0;
     907 
     908         if(timeout >= 0)
     909             selectRet = select(sock+1, &rfds, NULL, NULL, &tv);
     910         else   //最一个参数设置为NULL,表示阻塞操作会一直等待,直到莫个监视的文件集合中的某个文件描述符符合返回条件
     911             selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
     912 
     913         if(selectRet == 0 && timeout < 0) 
     914         {
     915             cout << "selectRet == 0 && timeout < 0" << endl;
     916             return -1;
     917         }
     918         else if(selectRet == -1) //select()出错
     919         {
     920             cout << "selectRet == 0 && timeout < 0 else" << endl;
     921             return -1;
     922         }
     923 
     924         ret = read(sock, headerPtr, 1);
     925         if(ret == -1)
     926         {
     927             cout << "!error: read() in read_header()" << endl;
     928             return -1;
     929         }
     930 
     931         bytesRead++;
     932                 
     933         if(*headerPtr == '\r')
     934         {                 /* Ignore CR */
     935             /* Basically do nothing special, just don't set newlines
     936              *      to 0 */
     937             headerPtr++;
     938             continue;
     939         }
     940         else if(*headerPtr == '\n')             /* LF is the separator */
     941             newlines++;
     942         else    
     943             newlines = 0;
     944                 
     945         headerPtr++;
     946 
     947     }
     948         
     949     //headerPtr -= 3;         /* Snip the trailing LF's */
     950                   /* to be compatible with Tianwang format, we have to retain them*/
     951     headerPtr -= 2;
     952     *headerPtr = '\0';
     953     //cout << "in it " << headerPtr << endl;
     954     return bytesRead;
     955 }
     956 
     957 
     958 
     959 /*
     960  function:被CreateSocket()调用,通过IO复用的方法连接目标服务器
     961  success: return 0;
     962  fail:    return -1;
     963  sockfd:  套接子文件描述符
     964  sa:      服务器套接子地址结构
     965  sec:     最长的等待时间
     966  */
     967 int CHttp::nonb_connect(int sockfd,struct sockaddr* sa,int sec)
     968 {
     969     int flags;
     970     int status;
     971     fd_set mask;//写文件描述符集合
     972     struct timeval timeout;
     973 
     974     //set the socket as nonblocking
     975     flags=fcntl(sockfd,F_GETFL,0);//将套接子文件描述符设置为非阻塞方式
     976 
     977     if(flags<0) return -1;
     978     flags|=O_NONBLOCK;//设置非阻塞方式
     979     if(fcntl(sockfd,F_SETFL,flags) < 0)
     980     {
     981         cout << "1.fcntl() in nonb_connect" << endl;
     982         return -1;
     983     }
     984 
     985     if( connect(sockfd,sa,sizeof(struct sockaddr)) == 0)//立刻连接上了
     986     {
     987         flags&=~O_NONBLOCK;//因为上面已经设置了非阻塞方式,所以我们这里有必要重新设置阻塞方式--相当于复位
     988         fcntl(sockfd,F_SETFL,flags);
     989         return sockfd;//connected immediately
     990         }
     991 
     992     FD_ZERO(&mask);//清理写文件描述符集合mask
     993     FD_SET(sockfd,&mask);//将sockfd套接字文件描述符加到文件描述符集合mask中
     994     timeout.tv_sec=sec;//设置最长的等待时间
     995     timeout.tv_usec=0;
     996     status=select(sockfd+1,NULL,&mask,NULL,&timeout);//IO复用
     997 
     998     switch(status){
     999         case -1:        // Select error, set the socket as default blocking  //select()出错
    1000             flags&=~O_NONBLOCK;
    1001             fcntl(sockfd,F_SETFL,flags);
    1002             cout << "2.fcntl() in nonb_connect" << endl;
    1003             return -1;
    1004         case 0:         //Connection timed out.//连接超时
    1005             flags&=~O_NONBLOCK;
    1006             fcntl(sockfd,F_SETFL,flags);
    1007             cout << "3.fcntl() in nonb_connect" << endl;
    1008             return -1;
    1009         default:         // Connected successfully.//连接成功
    1010             FD_CLR(sockfd,&mask);
    1011             flags&=~O_NONBLOCK;
    1012             fcntl(sockfd,F_SETFL,flags);
    1013             return 0;
    1014     }
    1015 }
    1016 
    1017 /*
    1018 function: 检测*buf所指的内存空间剩余值是否大于more,不过再加more+1单位的内存空间
    1019 success:  return 0;
    1020 fail:     return -1;
    1021 */
    1022 int CHttp::checkBufSize(char **buf, int *bufsize, int more)
    1023 {
    1024     char *tmp;
    1025     int roomLeft = *bufsize - (strlen(*buf) + 1);//*buf内存空间的剩余值
    1026 
    1027     if(roomLeft > more) return 0;//剩余值大于more返回0
    1028 
    1029     //pthread_mutex_lock(&mutexMemory);
    1030     tmp = (char *)realloc(*buf, *bufsize + more + 1);//剩余值不够more,这个时候我们要调整内存空间的长度,长度加more+1
    1031     //pthread_mutex_unlock(&mutexMemory);
    1032     if(tmp == NULL) return -1;//没有调整成功返回-1
    1033 
    1034     *buf = tmp;
    1035     *bufsize += more + 1;
    1036     return 0;//调整成功
    1037 }
  • 相关阅读:
    机器学习(二)分类决策树
    机器学习(一):朴素贝叶斯
    NLP(五):关键词提取补充(语料库和向量空间)
    oracle变量的定义和使用【转】
    一些开发技巧和工具使用心得
    醒狮赋
    C# 字符串的截取和替换
    C# 判断字符串是否为日期格式
    重装系统
    电脑桌面所有图标消失
  • 原文地址:https://www.cnblogs.com/kakamilan/p/2578889.html
Copyright © 2011-2022 走看看