zoukankan      html  css  js  c++  java
  • 用C++爬取网页

              做了好几天,终于写出来了,以前没有想到过,用C++也可以爬取网页,经过这么多天的努力终于做好了,解决了乱码问题。

    从中学到很多,小到一个函数的参数,大到如何使用一个函数。

               还有C++中一直让人头疼的编码问题,尤其是unicode编码问题。为此研究了很多资料,对MultiByteToWideChar和WideCharToMultiByte也有了新的认识。

    一个重要的关键是windows默认的是ANSI字符集,同时对HTML的格式进行了分析,以判断编码问题。

               感觉那么多天的辛苦没有白费,付出有了收获。不过在此,真的感谢那些牛人,期间也参考了他们的代码。

     代码:

    #include <winsock2.h>
    #include <cstring>
    #include <fstream>
    #include <iostream>
    #include <string>
    #pragma comment(lib,"ws2_32.lib")
    
    using namespace std;
    
    void getWebPage(char *url)
    {
    	SOCKET sock;
    	WSADATA wsa;
    	struct sockaddr_in  addrclient;
    	ofstream of;
    	WSAStartup(MAKEWORD(2,2),&wsa);
    	of.open("temp.txt");
    	if(!of)
    	{
    		cout<<"open fail!"<<endl;
    		return;
    	}
    	static char content[100000]="";
    	char myurl[256];
    	char host[256];
    	char dom[256];
    	char header[256];
    	char type[512];
    	char *p;
    	memset(myurl,'\0',256);
    	memset(host,'\0',256);
    	memset(dom,'\0',256);
    	memset(header,'\0',256);
    	memset(type,'\0',512);
    	char *purl=0;
    	struct hostent *phost;
    	sock=socket(PF_INET,SOCK_STREAM,IPPROTO_TCP);
    
    	strcpy(myurl,url);
    	for(purl=myurl;*purl!='/'&&purl!='\0';++purl);
    	if(int(purl-myurl)==strlen(myurl))
    		strcpy(host,"/");
    	else
    		strcpy(host,purl);
    	*purl='\0';
    	strcpy(dom,myurl);
    
    	cout<<dom<<endl;          //输出域名
    	cout<<host<<endl;     //输出地址
    	of<<dom<<endl;
    	of<<host<<endl;
    	phost=gethostbyname(dom);
    		
    	addrclient.sin_family=AF_INET;
    	addrclient.sin_port=htons(80);
    	addrclient.sin_addr.S_un.S_addr=*((unsigned long *)phost->h_addr);
    	
    	connect(sock,(struct sockaddr*)&addrclient,sizeof(addrclient));
    	
    	strcat(header, "GET "); 
        strcat(header, host); 
        strcat(header, " HTTP/1.1\r\n"); 
        strcat(header, "Host: "); 
        strcat(header, dom); 
        strcat(header, "\r\nConnection: Close\r\n\r\n"); 
    	send(sock,header,strlen(header),0);
    	recv(sock,type,512,0);
    	cout<<type<<endl;
    	of<<type;
    	p=strstr(type,"utf-8");
    	if(p)
    	{
    	memset(content,'\0',100000);
    	while(recv(sock,content,100000,0)>0)
    	{
    		int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
            unsigned short * wszGBK = new unsigned short[len+1];
            memset(wszGBK, 0, len * 2 + 2);
            MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
    		len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);  
            char *szGBK=new char[len + 1];
            memset(szGBK, 0, len + 1);
            WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
    		cout<<szGBK;
    		of<<szGBK;
    		strnset(content,'\0',100000);
    		delete []wszGBK;
    		delete [] szGBK;
    	}
    	}
    	else
    	{
    		memset(type,'\0',512);
    		recv(sock,type,512,0);
    		cout<<type;
    		of<<type;
    		p=strstr(type,"gb2312");
    		if(p)
    		{
    			while(recv(sock,content,100000,0))
    			{
    				cout<<content;
    				of<<content;
    				strnset(content,'\0',100000);
    			}
    		}
    		else
    		{
               while(recv(sock,content,100000,0)>0)
    	       {
    		       int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
                   unsigned short * wszGBK = new unsigned short[len+1];
                   memset(wszGBK, 0, len * 2 + 2);
                   MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
    		       len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);  
                   char *szGBK=new char[len + 1];
                   memset(szGBK, 0, len + 1);
                   WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
    		       cout<<szGBK;
    		       of<<szGBK;
    		       strnset(content,'\0',100000);
    		       delete []wszGBK;
    		       delete [] szGBK;
    	       }
    		}
    	}
    	closesocket(sock); 
        WSACleanup();
    	of.close();
    	cout<<endl;
    }
    // Prompts for a URL (the "http://" prefix is printed, the user types the
    // rest) and fetches that page with getWebPage.
    // Returns 0 on success, 1 if no URL could be read from stdin.
    int main()
    {
        char url[256];
        std::cout << "http://";
        // Limit extraction to sizeof(url)-1 characters so an over-long input
        // cannot overflow the buffer.
        std::cin.width(sizeof(url));
        if (!(std::cin >> url))
            return 1;
        getWebPage(url);
        return 0;
    }
    

     对此,又对socket编程产生了兴趣,socket编程魅力无穷。

  • 相关阅读:
    线性表的实现用通用方法实现线性表的初始化、求表长、插入元素、删除元素等
    用c++定义两个坐标点,计算两点间距离;进而计算线段的面积
    Java:学生信息的录入,各种排序,对文件的操作
    数组1 2 3 4 5 6 1(输入-1结束),奇数位的数逆序,偶数位数不变
    按层次遍历二叉树,用队列作为缓冲
    Chapter09"内核模式下的线程同步"之事件内核对象
    Chapter10“I/O设备的同步和异步”之打开和关闭设备
    CSDN博客积分系统
    探秘Java垃圾回收机制
    Chapter09“内核模式下的线程同步”之可等待的计时器内核对象
  • 原文地址:https://www.cnblogs.com/xshang/p/3097589.html
Copyright © 2011-2022 走看看