zoukankan      html  css  js  c++  java
  • C++ 网络爬虫实现

    最近有个概念炒得很火:网络爬虫。不过基本都是用 Python 或者 Java 写的,貌似很少看到用 C++ 写的。我在网上找了一个 C++ 的实现,发现其实算法还是很简单的。

    算法讲解:1.遍历资源网站

         2.获取html信息

           3.然后解析网址和图片url下载。

           4.对解析出来的网址重复以上步骤(代码里是用队列做广度优先遍历,而不是真正的递归调用)

    BFS是最重要的处理:
     
           先获取网页响应并保存到文本文件里,然后调用 HTMLParse 解析出其中的超链接和图片链接,

           最后调用 DownLoadImg 下载所有图片。

    //广度遍历  
    void BFS( const string & url ){  
    	char * response;  
    	int bytes;  
    	// 获取网页的相应,放入response中。  
    	if( !GetHttpResponse( url, response, bytes ) ){  
    		cout << "The url is wrong! ignore." << endl;  
    		return;  
    	}  
    	string httpResponse=response;  
    	free( response );  
    	string filename = ToFileName( url );  
    	ofstream ofile( "./html/"+filename );  
    	if( ofile.is_open() ){  
    		// 保存该网页的文本内容  
    		ofile << httpResponse << endl;  
    		ofile.close();  
    	}  
    	vector<string> imgurls;  
    	//解析该网页的所有图片链接,放入imgurls里面  
    	HTMLParse( httpResponse,  imgurls, url );  
    
    	//下载所有的图片资源  
    	DownLoadImg( imgurls, url );  
    }  
    

    然后附上代码:

    #include "stdafx.h"
    
    //#include <Windows.h>  
    #include <string>  
    #include <iostream>  
    #include <fstream>  
    #include <vector>  
    #include "winsock2.h"  
    #include <time.h>  
    #include <queue>  
    #include <hash_set>  
    
    #pragma comment(lib, "ws2_32.lib")   
    using namespace std;  
    
    #define DEFAULT_PAGE_BUF_SIZE 1048576  
    
    // FIFO of hyperlinks waiting to be crawled: filled by HTMLParse, drained by main.
    queue<string> hrefUrl;  
    // Hyperlinks that have already been queued (plus the start page), so that
    // no link is queued twice.
    hash_set<string> visitedUrl;  
    // Image URLs that have already been collected, so each is scheduled once.
    hash_set<string> visitedImg;  
    // Not referenced by any function in this listing.
    int depth=0;  
    // Running number printed in front of each saved image path by DownLoadImg.
    int g_ImgCnt=1;  
    
    // Split a URL into host name and resource path.
    //
    // url      - "http://host/path" or "host/path"; only the http:// scheme is
    //            recognised, and only at the very start of the string.
    // host     - out: text between the optional scheme and the first '/'.
    // resource - out: the path starting at that '/', cut at the first
    //            whitespace character so it can be placed in a request line.
    //
    // Returns false (leaving host and resource untouched) when the URL is
    // longer than 2000 characters, has no '/' after the host, or has an empty
    // host (e.g. a relative link such as "/a/b.html").
    bool ParseURL( const std::string & url, std::string & host, std::string & resource ){
        const std::string::size_type kMaxUrlLength = 2000;
        if( url.size() > kMaxUrlLength )
            return false;

        // Skip the scheme only when it is a real prefix; an "http://" that
        // appears later (e.g. inside a query string) is part of the resource.
        const std::string scheme = "http://";
        std::string::size_type hostStart = 0;
        if( url.compare( 0, scheme.size(), scheme ) == 0 )
            hostStart = scheme.size();

        const std::string::size_type slash = url.find( '/', hostStart );
        if( slash == std::string::npos || slash == hostStart )
            return false;

        std::string::size_type resourceEnd = url.find_first_of( " \t\r\n\v\f", slash );
        if( resourceEnd == std::string::npos )
            resourceEnd = url.size();

        // std::string grows as needed, so arbitrarily long host names are safe.
        host = url.substr( hostStart, slash - hostStart );
        resource = url.substr( slash, resourceEnd - slash );
        return true;
    }
    
    //使用Get请求,得到响应  
    bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){  
    	string host, resource;  
    	if(!ParseURL( url, host, resource )){  
    		cout << "Can not parse the url"<<endl;  
    		return false;  
    	}  
    
    	//建立socket  
    	struct hostent * hp= gethostbyname( host.c_str() );  
    	if( hp==NULL ){  
    		cout<< "Can not find host address"<<endl;  
    		return false;  
    	}  
    
    	SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP);  
    	if( sock == -1 || sock == -2 ){  
    		cout << "Can not create sock."<<endl;  
    		return false;  
    	}  
    
    	//建立服务器地址  
    	SOCKADDR_IN sa;  
    	sa.sin_family = AF_INET;  
    	sa.sin_port = htons( 80 );  
    	//char addr[5];  
    	//memcpy( addr, hp->h_addr, 4 );  
    	//sa.sin_addr.s_addr = inet_addr(hp->h_addr);  
    	memcpy( &sa.sin_addr, hp->h_addr, 4 );  
    
    	//建立连接  
    	if( 0!= connect( sock, (SOCKADDR*)&sa, sizeof(sa) ) ){  
    		cout << "Can not connect: "<< url <<endl;  
    		closesocket(sock);  
    		return false;  
    	};  
    
    	//准备发送数据  
    	string request = "GET " + resource + " HTTP/1.1
    Host:" + host + "
    Connection:Close
    
    ";  
    
    	//发送数据  
    	if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){  
    		cout << "send error" <<endl;  
    		closesocket( sock );  
    		return false;  
    	}  
    
    	//接收数据  
    	int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;  
    	char *pageBuf = (char *)malloc(m_nContentLength);  
    	memset(pageBuf, 0, m_nContentLength);  
    
    	bytesRead = 0;  
    	int ret = 1;  
    	cout <<"Read: ";  
    	while(ret > 0){  
    		ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);  
    
    		if(ret > 0)  
    		{  
    			bytesRead += ret;  
    		}  
    
    		if( m_nContentLength - bytesRead<100){  
    			cout << "
    Realloc memorry"<<endl;  
    			m_nContentLength *=2;  
    			pageBuf = (char*)realloc( pageBuf, m_nContentLength);       //重新分配内存  
    		}  
    		cout << ret <<" ";  
    	}  
    	cout <<endl;  
    
    	pageBuf[bytesRead] = '';  
    	response = pageBuf;  
    	closesocket( sock );  
    	return true;  
    	//cout<< response <<endl;  
    }  
    
    // Scan an HTML page for hyperlinks and image URLs.
    //
    // htmlResponse - page text to scan; it is not modified.
    // imgurls      - out: every image URL not yet present in the global
    //                visitedImg set is appended here.
    // host         - unused; kept so existing callers keep compiling.
    //
    // Side effects:
    //   * each double-quoted href="..." value that is not yet in visitedUrl is
    //     added to visitedUrl, appended to url.txt and pushed onto hrefUrl;
    //   * for each "<img " tag the lazy-src="..." value is taken if the tag
    //     has one, otherwise its src="..." value; the URL is echoed to stdout
    //     and, if new, recorded in visitedImg and appended to imgurls.
    //
    // Only double-quoted attribute values are recognised, URLs are stored
    // exactly as written (relative links are not resolved), and empty values
    // are skipped. Scanning stops at an attribute whose closing quote is
    // missing.
    void HTMLParse ( string & htmlResponse, vector<string> & imgurls, const string & host ){
        (void)host;

        // ---- hyperlinks -------------------------------------------------
        const string hrefTag = "href=\"";
        ofstream ofile( "url.txt", ios::app );
        string::size_type pos = htmlResponse.find( hrefTag );
        while( pos != string::npos ){
            pos += hrefTag.size();
            const string::size_type endQuote = htmlResponse.find( '"', pos );
            if( endQuote == string::npos )
                break;              // unterminated value: nothing more to parse
            if( endQuote > pos ){
                const string link = htmlResponse.substr( pos, endQuote - pos );
                if( visitedUrl.find( link ) == visitedUrl.end() ){
                    visitedUrl.insert( link );
                    ofile << link << endl;
                    hrefUrl.push( link );
                }
            }
            pos = htmlResponse.find( hrefTag, pos );
        }
        ofile << endl << endl;
        ofile.close();

        // ---- images -----------------------------------------------------
        const string imgTag = "<img ";
        const string srcAttr = "src=\"";
        const string lazyAttr = "lazy-src=\"";
        string::size_type tagPos = htmlResponse.find( imgTag );
        while( tagPos != string::npos ){
            tagPos += imgTag.size();

            // The attributes of this tag end at the next '>' (or at the end
            // of the text when the tag is never closed).
            string::size_type tagEnd = htmlResponse.find( '>', tagPos );
            if( tagEnd == string::npos )
                tagEnd = htmlResponse.size();

            string::size_type valuePos = htmlResponse.find( lazyAttr, tagPos );
            if( valuePos != string::npos && valuePos < tagEnd ){
                valuePos += lazyAttr.size();
            } else {
                valuePos = htmlResponse.find( srcAttr, tagPos );
                if( valuePos == string::npos )
                    break;          // no src=" left anywhere, so no more images
                if( valuePos >= tagEnd ){
                    // That src belongs to a later tag, not to this <img>.
                    tagPos = htmlResponse.find( imgTag, tagPos );
                    continue;
                }
                valuePos += srcAttr.size();
            }

            const string::size_type endQuote = htmlResponse.find( '"', valuePos );
            if( endQuote == string::npos )
                break;              // unterminated value: nothing more to parse
            if( endQuote > valuePos ){
                const string imgUrl = htmlResponse.substr( valuePos, endQuote - valuePos );
                cout << imgUrl << endl;
                if( visitedImg.find( imgUrl ) == visitedImg.end() ){
                    visitedImg.insert( imgUrl );
                    imgurls.push_back( imgUrl );
                }
            }
            tagPos = htmlResponse.find( imgTag, tagPos );
        }
        cout << "end of Parse this html" << endl;
    }
    
    // Turn a URL into a name that is usable as a Windows file name.
    //
    // Removes every character from the set  \ / : * ? " < > |  and appends
    // ".txt" to what is left. All other characters are kept unchanged and in
    // order, so an empty URL yields ".txt".
    std::string ToFileName( const std::string &url ){
        const std::string forbidden = "\\/:*?\"<>|";
        std::string fileName;
        fileName.reserve( url.size() + 4 );
        for( std::string::size_type i = 0; i < url.size(); ++i ){
            const char ch = url[i];
            if( forbidden.find( ch ) == std::string::npos )
                fileName += ch;
        }
        fileName += ".txt";
        return fileName;
    }
    
    // Download the images listed in `imgurls` into a per-page folder.
    //
    // imgurls - image URLs collected from the page; entries whose extension
    //           is not bmp, jpg, jpeg, gif or png are skipped.
    // url     - address of the page the images came from; the target folder
    //           is "./img/" followed by ToFileName(url).
    //
    // Each image is fetched with GetHttpResponse. Everything after the first
    // blank line of the response (the end of the HTTP headers) is written to
    // <folder>/<last path component of the image URL>. Responses without a
    // header terminator, and URLs without a '/', produce no file. The HTTP
    // status is not inspected. Every response buffer is freed, whether or
    // not a file was written.
    void DownLoadImg( vector<string> & imgurls, const string &url ){
        // Folder that receives the images of this page.
        const string foldname = "./img/" + ToFileName( url );
        if( !CreateDirectory( foldname.c_str(), NULL ) )
            cout << "Can not create directory:" << foldname << endl;

        const char * const headerEnd = "\r\n\r\n";

        for( vector<string>::size_type i = 0; i < imgurls.size(); ++i ){
            const string & imgUrl = imgurls[i];

            // Accept only well-known image extensions.
            const string::size_type dot = imgUrl.find_last_of( '.' );
            if( dot == string::npos )
                continue;
            const string ext = imgUrl.substr( dot + 1 );
            if( ext != "bmp" && ext != "jpg" && ext != "jpeg" && ext != "gif" && ext != "png" )
                continue;

            char * image = NULL;
            int byteRead = 0;
            if( !GetHttpResponse( imgUrl, image, byteRead ) )
                continue;

            // The body starts right after the blank line that ends the
            // headers; the buffer is NUL-terminated, so strstr stays inside it.
            const char * body = NULL;
            if( byteRead > 0 ){
                const char * sep = strstr( image, headerEnd );
                if( sep != NULL )
                    body = sep + strlen( headerEnd );
            }

            const string::size_type slash = imgUrl.find_last_of( '/' );
            if( body != NULL && slash != string::npos ){
                // imgname keeps its leading '/', which separates it from foldname.
                const string imgname = imgUrl.substr( slash );
                ofstream ofile( foldname + imgname, ios::binary );
                if( ofile.is_open() ){
                    cout << g_ImgCnt++ << foldname + imgname << endl;
                    ofile.write( body, byteRead - ( body - image ) );
                    ofile.close();
                }
            }

            // Single release point: no path out of this iteration leaks the buffer.
            free( image );
        }
    }
    
    
    
    //广度遍历  
    void BFS( const string & url ){  
    	char * response;  
    	int bytes;  
    	// 获取网页的相应,放入response中。  
    	if( !GetHttpResponse( url, response, bytes ) ){  
    		cout << "The url is wrong! ignore." << endl;  
    		return;  
    	}  
    	string httpResponse=response;  
    	free( response );  
    	string filename = ToFileName( url );  
    	ofstream ofile( "./html/"+filename );  
    	if( ofile.is_open() ){  
    		// 保存该网页的文本内容  
    		ofile << httpResponse << endl;  
    		ofile.close();  
    	}  
    	vector<string> imgurls;  
    	//解析该网页的所有图片链接,放入imgurls里面  
    	HTMLParse( httpResponse,  imgurls, url );  
    
    	//下载所有的图片资源  
    	DownLoadImg( imgurls, url );  
    }  
    
    void main()  
    {  
    	//初始化socket,用于tcp网络连接  
    	WSADATA wsaData;  
    	if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){  
    		return;  
    	}  
    
    	// 创建文件夹,保存图片和网页文本文件  
    	CreateDirectory( "./img",0);  
    	CreateDirectory("./html",0);  
    	//string urlStart = "http://hao.360.cn/meinvdaohang.html";  
    
    	// 遍历的起始地址  
    	 string urlStart = "http://desk.zol.com.cn/bizhi/7018_87137_2.html";  
    	//string urlStart = "http://item.taobao.com/item.htm?spm=a230r.1.14.19.sBBNbz&id=36366887850&ns=1#detail";  
    
    	// 使用广度遍历  
    	// 提取网页中的超链接放入hrefUrl中,提取图片链接,下载图片。  
    	BFS( urlStart );  
    
    	// 访问过的网址保存起来  
    	visitedUrl.insert( urlStart );  
    
    	while( hrefUrl.size()!=0 ){  
    		string url = hrefUrl.front();  // 从队列的最开始取出一个网址  
    		cout << url << endl;  
    		BFS( url );                   // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片  
    		hrefUrl.pop();                 // 遍历完之后,删除这个网址  
    	}  
    	WSACleanup();  
    	return;  
    }  


      

  • 相关阅读:
    Spark2.4.5集群安装与本地开发
    Windows玩转Kubernetes系列4-搭建K8S Dashboard
    Windows玩转Kubernetes系列3-Centos安装K8S
    Windows玩转Kubernetes系列2-Centos安装Docker
    Windows玩转Kubernetes系列1-VirtualBox安装Centos
    Lock wait timeout exceeded?代码该优化了
    RocketMQ初入门踩坑记
    Java8虚拟机(JVM)内存溢出实战
    CentOS 7 下 JDK1.8+Maven+Nginx+MySql+Git+Redis环境安装
    消息中间件—SpringBoot下RabbitMQ实战
  • 原文地址:https://www.cnblogs.com/alexhg/p/6656130.html
Copyright © 2011-2022 走看看