转自:http://blog.csdn.net/huangxy10/article/details/8120106
备注:需要把项目属性中的"字符集"改成"使用多字节字符集"。
1 // 网络爬虫.cpp : 定义控制台应用程序的入口点。 2 // 3 4 #include "stdafx.h" 5 /* 6 7 int _tmain(int argc, _TCHAR* argv[]) 8 { 9 return 0; 10 } 11 12 */ 13 14 //#include <Windows.h> 15 #include <string> 16 #include <iostream> 17 #include <fstream> 18 #include <vector> 19 #include "winsock2.h" 20 #include <time.h> 21 #include <queue> 22 #include <hash_set> 23 24 #pragma comment(lib, "ws2_32.lib") 25 using namespace std; 26 27 #define DEFAULT_PAGE_BUF_SIZE 1048576 28 29 queue<string> hrefUrl; 30 hash_set<string> visitedUrl; 31 hash_set<string> visitedImg; 32 int depth=0; 33 int g_ImgCnt=1; 34 35 //解析URL,解析出主机名,资源名 36 bool ParseURL( const string & url, string & host, string & resource){ 37 if ( strlen(url.c_str()) > 2000 ) { 38 return false; 39 } 40 41 const char * pos = strstr( url.c_str(), "http://" ); 42 if( pos==NULL ) pos = url.c_str(); 43 else pos += strlen("http://"); 44 if( strstr( pos, "/")==0 ) 45 return false; 46 char pHost[100]; 47 char pResource[2000]; 48 sscanf( pos, "%[^/]%s", pHost, pResource ); 49 host = pHost; 50 resource = pResource; 51 return true; 52 } 53 54 //使用Get请求,得到响应 55 bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){ 56 string host, resource; 57 if(!ParseURL( url, host, resource )){ 58 cout << "Can not parse the url"<<endl; 59 return false; 60 } 61 62 //建立socket 63 struct hostent * hp= gethostbyname( host.c_str() ); 64 if( hp==NULL ){ 65 cout<< "Can not find host address"<<endl; 66 return false; 67 } 68 69 SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP); 70 if( sock == -1 || sock == -2 ){ 71 cout << "Can not create sock."<<endl; 72 return false; 73 } 74 75 //建立服务器地址 76 SOCKADDR_IN sa; 77 sa.sin_family = AF_INET; 78 sa.sin_port = htons( 80 ); 79 //char addr[5]; 80 //memcpy( addr, hp->h_addr, 4 ); 81 //sa.sin_addr.s_addr = inet_addr(hp->h_addr); 82 memcpy( &sa.sin_addr, hp->h_addr, 4 ); 83 84 //建立连接 85 if( 0!= connect( sock, (SOCKADDR*)&sa, sizeof(sa) ) ){ 86 cout << "Can not connect: "<< url <<endl; 87 closesocket(sock); 
88 return false; 89 }; 90 91 //准备发送数据 92 string request = "GET " + resource + " HTTP/1.1 Host:" + host + " Connection:Close "; 93 94 //发送数据 95 if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){ 96 cout << "send error" <<endl; 97 closesocket( sock ); 98 return false; 99 } 100 101 //接收数据 102 int m_nContentLength = DEFAULT_PAGE_BUF_SIZE; 103 char *pageBuf = (char *)malloc(m_nContentLength); 104 memset(pageBuf, 0, m_nContentLength); 105 106 bytesRead = 0; 107 int ret = 1; 108 cout <<"Read: "; 109 while(ret > 0){ 110 ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0); 111 112 if(ret > 0) 113 { 114 bytesRead += ret; 115 } 116 117 if( m_nContentLength - bytesRead<100){ 118 cout << " Realloc memorry"<<endl; 119 m_nContentLength *=2; 120 pageBuf = (char*)realloc( pageBuf, m_nContentLength); //重新分配内存 121 } 122 cout << ret <<" "; 123 } 124 cout <<endl; 125 126 pageBuf[bytesRead] = '