zoukankan      html  css  js  c++  java
  • 网络爬虫WebCrawler(1)-Http网页内容抓取

    在 Windows 下用 C++ 通过 HTTP 协议抓取网页的内容:

        首先介绍两个重要的库(在 Linux 下通常以开源包的形式提供,在 Windows 下则以动态链接库 DLL 的形式提供):curl 和 pthreads。其中 curl 可以理解为命令行浏览器,通过调用其内置的 curl_easy_setopt 等函数就可以获取指定网页的内容(要正确编译并导入 curl 链接库,还需要另外一个库 C-ares)。pthreads 是多线程控制库,其中包括互斥量的加锁和解锁、程序线程的分配等函数。

    下载地址:点击打开链接。要正确导入外部动态链接库,需要以下步骤:1. 项目->属性->配置属性->C/C++->常规->附加包含目录(加入 include 的路径);2. 项目->属性->配置属性->链接器->常规->附加库目录(加入 lib 所在的路径);3. 在链接器->输入->附加依赖项中加入 libcurld.lib;pthreadVC2.lib;ws2_32.lib;winmm.lib;wldap32.lib;areslib.lib;4. 在 C/C++->预处理器->预处理器定义中加入 _CONSOLE;BUILDING_LIBCURL;HTTP_ONLY。

        详细实现过程介绍:

    1:自定义 HashTable 结构,用来记录已获取字符串的哈希值,以 HashTable 类的形式实现。其中包括保存哈希值的 set,以及 add、find 和几种常见的字符串哈希函数。

    Code:

    ///HashTable.h
    #ifndef HashTable_H
    #define HashTable_H
    
    #include <set>
    #include <string>
    #include <vector>
    
    /**
     * Records hash values of strings in a std::set so that callers can later
     * ask whether a string's hash has already been recorded. Only the hash
     * values are stored, not the strings themselves. The class also exposes a
     * collection of general-purpose string hash functions.
     */
    class HashTable
    {
    public:
    	HashTable();
    	~HashTable();

    	/* Set operations */
    	unsigned int Find(const std::string& str);      // look up the hash of str
    	unsigned int ForceAdd(const std::string& str);  // record the hash of str

    	/* Common string hash functions */
    	unsigned int APHash  (const std::string& str);
    	unsigned int BKDRHash(const std::string& str);
    	unsigned int BPHash  (const std::string& str);
    	unsigned int DEKHash (const std::string& str);
    	unsigned int DJBHash (const std::string& str);
    	unsigned int ELFHash (const std::string& str);
    	unsigned int FNVHash (const std::string& str);
    	unsigned int JSHash  (const std::string& str);
    	unsigned int PJWHash (const std::string& str);
    	unsigned int RSHash  (const std::string& str);
    	unsigned int SDBMHash(const std::string& str);

    private:
    	// Hash values recorded so far.
    	std::set<unsigned int> HashFunctionResultSet;
    	// Never populated by any code in this file.
    	std::vector<unsigned int> hhh;
    };
    #endif
    
    /////HashTable.cpp
    #include "HashTable.h"
    
    
    // Nothing to set up: both member containers default-construct empty.
    HashTable::HashTable()
    {
    }
    
    
    // Nothing to release explicitly: the member containers clean up themselves.
    HashTable::~HashTable()
    {
    }
    
    
    // Hashes `str` with ELFHash, records the hash value in the set, and
    // returns it. Inserting a value that is already present leaves the set
    // unchanged.
    unsigned int HashTable::ForceAdd(const std::string& str)
    {
    	const unsigned int hashValue = ELFHash(str);
    	HashFunctionResultSet.insert(hashValue);
    	return hashValue;
    }
    
    
    // Looks up the ELFHash of `str` among the recorded hash values.
    //
    // Returns the hash value when it has been recorded, otherwise the sentinel
    // static_cast<unsigned int>(-1) (all bits set). ELFHash masks off the top
    // four bits of its result, so with a 32-bit unsigned int the sentinel can
    // never collide with a real hash value.
    unsigned int HashTable::Find(const std::string& str)
    {
    	const unsigned int hashValue = ELFHash(str);
    	// find() on an empty set simply returns end(), so no size check is needed.
    	if(HashFunctionResultSet.find(hashValue) == HashFunctionResultSet.end())
    		return static_cast<unsigned int>(-1);
    	return hashValue;
    }
    
    
    /* Implementations of several common string hash functions */

    // AP-style hash: starts from 0xAAAAAAAA and XORs in a different mix of the
    // running hash and the current character depending on whether the
    // character's index is even or odd.
    unsigned int HashTable::APHash(const std::string& str)
    {
    	unsigned int hash = 0xAAAAAAAA;
    	bool evenIndex = true;   // index 0 is even
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		const char ch = *it;
    		unsigned int mix;
    		if(evenIndex)
    			mix = (hash << 7) ^ (ch * (hash >> 3));
    		else
    			mix = ~(((hash << 11) + ch) ^ (hash >> 5));
    		hash ^= mix;
    		evenIndex = !evenIndex;
    	}
    	return hash;
    }
    
    // BKDR hash: multiplies the running hash by a fixed seed and adds each
    // character in turn.
    unsigned int HashTable::BKDRHash(const std::string& str)
    {
    	const unsigned int kSeed = 131;   // other usual choices: 31, 1313, 13131, 131313, ...
    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = hash * kSeed + *it;
    	}
    	return hash;
    }
    
    // BP hash: shifts the running hash left by 7 bits and XORs in each
    // character.
    unsigned int HashTable::BPHash(const std::string& str)
    {
    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = (hash << 7) ^ *it;
    	}
    	return hash;
    }
    
    // DEK hash: seeded with the string length; each step combines the hash
    // shifted left by 5 with the hash shifted right by 27, then XORs in the
    // character.
    unsigned int HashTable::DEKHash(const std::string& str)
    {
    	const std::size_t len = str.length();
    	unsigned int hash = static_cast<unsigned int>(len);
    	std::size_t pos = 0;
    	while(pos < len)
    	{
    		const unsigned int mixed = (hash << 5) ^ (hash >> 27);
    		hash = mixed ^ str[pos];
    		++pos;
    	}
    	return hash;
    }
    
    // DJB hash: starts at 5381; each step adds (hash << 5) to the hash and then
    // adds the character.
    unsigned int HashTable::DJBHash(const std::string& str)
    {
    	unsigned int hash = 5381;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash += (hash << 5);
    		hash += *it;
    	}
    	return hash;
    }
    
    // ELF hash: shifts the running hash left by 4 and adds each character; any
    // bits that land in the top nibble (mask 0xF0000000) are folded back in,
    // shifted right by 24, and then cleared from the hash.
    unsigned int HashTable::ELFHash(const std::string& str)
    {
    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = (hash << 4) + *it;
    		const unsigned int top = hash & 0xF0000000L;
    		if(top != 0)
    		{
    			hash ^= (top >> 24);
    		}
    		hash &= ~top;
    	}
    	return hash;
    }
    
    // FNV-style hash: multiplies the running hash by the constant 0x811C9DC5
    // and then XORs in each character, starting from 0.
    unsigned int HashTable::FNVHash(const std::string& str)
    {
    	const unsigned int kMultiplier = 0x811C9DC5;
    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = (hash * kMultiplier) ^ *it;
    	}
    	return hash;
    }
    
    // JS hash: starts at 1315423911 and XORs the hash with the sum of
    // (hash << 5), the character, and (hash >> 2) at every step.
    unsigned int HashTable::JSHash(const std::string& str)
    {
    	unsigned int hash = 1315423911;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		const unsigned int mix = (hash << 5) + *it + (hash >> 2);
    		hash ^= mix;
    	}
    	return hash;
    }
    
    // PJW hash: shift amounts and the high-bit mask are derived from the width
    // of unsigned int. Each step shifts the hash left by one eighth of that
    // width and adds the character; bits that reach the high mask are folded
    // back in (shifted right by three quarters of the width) and masked off.
    unsigned int HashTable::PJWHash(const std::string& str)
    {
    	const unsigned int kBits          = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    	const unsigned int kThreeQuarters = static_cast<unsigned int>((kBits * 3) / 4);
    	const unsigned int kOneEighth     = static_cast<unsigned int>(kBits / 8);
    	const unsigned int kHighBits      = static_cast<unsigned int>(0xFFFFFFFF) << (kBits - kOneEighth);

    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = (hash << kOneEighth) + *it;
    		const unsigned int overflow = hash & kHighBits;
    		if(overflow != 0)
    		{
    			hash = (hash ^ (overflow >> kThreeQuarters)) & ~kHighBits;
    		}
    	}
    	return hash;
    }
    
    // RS hash: multiplies the running hash by a multiplier and adds the
    // character; the multiplier itself is scaled by 378551 after every
    // character, starting from 63689.
    unsigned int HashTable::RSHash(const std::string& str)
    {
    	const unsigned int kStep = 378551;
    	unsigned int multiplier  = 63689;
    	unsigned int hash        = 0;

    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		hash = hash * multiplier + *it;
    		multiplier *= kStep;
    	}
    	return hash;
    }
    
    // SDBM hash: each step computes
    // character + (hash << 6) + (hash << 16) - hash.
    unsigned int HashTable::SDBMHash(const std::string& str)
    {
    	unsigned int hash = 0;
    	for(std::string::const_iterator it = str.begin(); it != str.end(); ++it)
    	{
    		const unsigned int previous = hash;
    		hash = *it + (previous << 6) + (previous << 16) - previous;
    	}
    	return hash;
    }


    2:实现线程间的互斥处理函数(另外提供当前操作线程的 ID,以便配合加锁机制)。以 SingleTone(单例)类实现:该类只能通过静态函数 Instance 建立唯一的类对象,并以互斥的方式实现对 HashTable 的基本操作。其中变量的加锁和解锁由 mutex 类来实现,详细参见代码:

    ////mutex.h
    #ifndef mutex_H
    #define mutex_H
    
    #pragma once
    
    #include "pthread.h"
    
    /**
     * Scoped lock for a pthread mutex: the constructor locks the mutex it is
     * given and the destructor unlocks it, so the mutex is held for exactly
     * the lifetime of the guard object.
     *
     * The guard only stores a reference; the caller must keep the underlying
     * pthread_mutex_t alive (and initialized) for as long as the guard exists.
     */
    class mutex
    {
    	pthread_mutex_t& m_mutex;

    	// Not copyable: a copied guard would unlock the same mutex a second
    	// time when it is destroyed. Declared private and intentionally left
    	// undefined.
    	mutex(const mutex&);
    	mutex& operator=(const mutex&);
    public:
    	// Locks `m`; blocks until the lock is acquired.
    	mutex(pthread_mutex_t& m):m_mutex(m) 
    	{
    		pthread_mutex_lock(&m_mutex);
    	}

    	// Unlocks the mutex locked by the constructor.
    	~mutex(void)
    	{
    		pthread_mutex_unlock(&m_mutex);
    	}
    };
    #endif
    


     

    ////SingleTone.h
    #ifndef SingleTone_H
    #define SingleTone_H
    
    #include <string>
    #include <list>
    #include <map>
    #include "Constants.h"
    #include "HashTable.h"
    #include "pthread.h"
    #include "curl/curl.h"
    
    /**
     * Process-wide shared state for the crawler, reachable only through
     * Instance(). It holds:
     *   - a list of link strings (m_LinkStack),
     *   - a map from a string key to a HashTable (m_UniqueMap),
     *   - one libcurl easy handle (m_pcurl),
     * together with the pthread mutex used by the member functions.
     */
    class SingleTone{

    public:
    	// Returns the single shared instance, creating it on first use.
    	static SingleTone* Instance();

    	// Operations on the link list.
    	void push_back(std::string s);
    	void pop_back();
    	int size();
    	std::list<std::string>::reference back();
    	std::list<std::string>::iterator begin();
    	std::list<std::string>::iterator end();
    	void push_front(std::string s);
    	bool empty();

    	// Operations on the key -> HashTable map.
    	unsigned int Get_m_UniqueMap_ForceAdd(const std::string& key,const std::string& url);
    	unsigned int Get_m_UniqueMap_Find(const std::string& key,const std::string& url);
    	HashTable Get_m_UniqueMap(const std::string& key);
    	void Set_m_UniqueMap(const std::string& key,HashTable& hash);

    	// Returns the libcurl easy handle created in the constructor.
    	CURL* GetpCurl();
    	
    protected:
    	// Construction and destruction are restricted so that outside code
    	// has to go through Instance().
    	SingleTone();
    	~SingleTone();
    	pthread_mutex_t m_singleton_mutex;

    private:
    	// Not copyable: a copy would duplicate the raw pthread mutex and the
    	// CURL handle pointer. Declared private and intentionally left
    	// undefined.
    	SingleTone(const SingleTone&);
    	SingleTone& operator=(const SingleTone&);

    	static SingleTone* m_pSingleTone;
    	std::list<std::string> m_LinkStack;
    	std::map<std::string,HashTable> m_UniqueMap;
    	CURL *m_pcurl;
    };
    #endif
    #include "SingleTone.h"
    #include "mutex.h"
    
    
    // Storage for the single shared instance; stays NULL until Instance() allocates it.
    SingleTone* SingleTone::m_pSingleTone=NULL;
    
    // Initializes the mutex with default attributes and creates the libcurl
    // easy handle that GetpCurl() hands out.
    SingleTone::SingleTone()
    {
    	pthread_mutex_init(&m_singleton_mutex, NULL);

    	m_pcurl = curl_easy_init();
    }
    
    // Destroys the mutex. The CURL handle is not released here.
    SingleTone::~SingleTone()
    {
    	pthread_mutex_destroy(&m_singleton_mutex);
    }
    
    // Returns the single shared instance, allocating it on the first call.
    //
    // The check-and-create step runs under a statically initialized mutex so
    // that two threads calling Instance() for the first time at the same
    // moment cannot both construct an object. The instance is never deleted.
    SingleTone* SingleTone::Instance()
    {
    	// Static initialization with PTHREAD_MUTEX_INITIALIZER needs no
    	// runtime init call, so the guard itself cannot be raced on.
    	static pthread_mutex_t s_instance_mutex = PTHREAD_MUTEX_INITIALIZER;

    	mutex guard(s_instance_mutex);
    	if(m_pSingleTone==NULL){
    		m_pSingleTone=new SingleTone();
    	}
    	return (m_pSingleTone);
    }
    
    
    // Appends `s` to the end of the link list while holding the mutex.
    void SingleTone::push_back(std::string s)
    {
    	mutex guard(m_singleton_mutex);
    	m_LinkStack.push_back(s);
    }
    
    // Removes the last element of the link list while holding the mutex.
    // Calling this on an empty list is a no-op (std::list::pop_back on an
    // empty list would be undefined behaviour).
    void SingleTone::pop_back()
    {
    	mutex m(m_singleton_mutex);
    	if(m_LinkStack.empty())
    		return;
    	m_LinkStack.pop_back();
    }
    
    int SingleTone::size()
    {
    	return m_LinkStack.size();
    }
    
    // Returns an iterator to the first element of the link list.
    // No lock is taken here; the returned iterator is used by the caller
    // outside of any locking done by this class.
    std::list<std::string>::iterator SingleTone::begin()
    {
    	std::list<std::string>::iterator first = m_LinkStack.begin();
    	return first;
    }
    
    // Returns a reference to the last element of the link list.
    // The mutex is held only while the reference is obtained; it is released
    // before the caller uses the reference.
    std::list<std::string>::reference SingleTone::back()
    {
    	mutex guard(m_singleton_mutex);
    	std::list<std::string>::reference last = m_LinkStack.back();
    	return last;
    }
    
    // Returns the past-the-end iterator of the link list. No lock is taken.
    std::list<std::string>::iterator SingleTone::end()
    {
    	std::list<std::string>::iterator pastLast = m_LinkStack.end();
    	return pastLast;
    }
    
    // Inserts `s` at the front of the link list while holding the mutex.
    void SingleTone::push_front(std::string s)
    {
    	mutex guard(m_singleton_mutex);
    	m_LinkStack.push_front(s);
    }
    
    bool SingleTone::empty()
    {
    	return m_LinkStack.empty();
    }
    
    
    unsigned int SingleTone::Get_m_UniqueMap_ForceAdd(const std::string& key,const std::string& url)
    {
        mutex  m(m_singleton_mutex);
        return m_UniqueMap[key].ForceAdd(url);
    }
    
    
    unsigned int SingleTone::Get_m_UniqueMap_Find(const std::string& key,const std::string& url)
    {
        
        HashTable hss = m_UniqueMap[key];
        unsigned int uiRet =hss.Find(url);
        //unsigned int uiRet = m_UniqueMap[key]->Find(url);
        return uiRet;
    }
    
    
    // Returns a copy of the HashTable stored under `key`; an unknown key gets
    // an empty table inserted first (std::map::operator[] semantics).
    // The mutex is held while the map is accessed and the copy is made,
    // because operator[] may modify the map.
    HashTable SingleTone::Get_m_UniqueMap(const std::string& key)
    {
        mutex m(m_singleton_mutex);
        return m_UniqueMap[key];
    }
    
    void SingleTone::Set_m_UniqueMap(const std::string& key,HashTable& hash)
    {
          m_UniqueMap[key] = hash;
          
    }
    
    // Returns the libcurl easy handle created in the constructor. Every
    // caller receives the same pointer.
    CURL* SingleTone::GetpCurl()
    {
        CURL* const handle = m_pcurl;
        return handle;
    }

    3:实现通过 HTTP 获取网页内容:功能包括初始网页内容的获取和 URL 设置等函数。这个过程要求是互斥的,所以引入 SingleTone 类的内容。

    Code:

    /////Http.h
    #ifndef Http_H
    #define Http_H
    
    #include "curl/curl.h"
    #include "pthread.h"
    #include <string>
    
    using namespace std;
    
    /**
     * Fetches the content of a URL through libcurl.
     *
     * The CURL easy handle is not created here; the constructor takes it from
     * SingleTone::Instance(). Data received during a transfer is accumulated
     * in m_szbuffer.
     */
    class Http
    {
    public:
    	Http();
    	~Http();

    	// Parameterless overload; always returns false.
    	bool InitCurl();
    	// Downloads `url`; on success the content is copied into `szbuffer`.
    	bool InitCurl(const std::string& url, std::string& szbuffer);
    	// Cleans up the CURL handle and libcurl's global state.
    	bool DeInitCurl();

    	// Stores `url` as the current URL.
    	void setUrl(const std::string& url);
    	// Despite its name, this overload returns the stored URL.
    	std::string setUrl();
    	// Returns a copy of the internal receive buffer.
    	const std::string getBuffer();


    private:
    	// Callback registered with libcurl; `f` is the Http object that
    	// receives the data.
    	static void writer(void* buffer,size_t size,size_t nmemb,void* f);
    	// Appends size*nmemb bytes from `buffer` to m_szbuffer.
    	int setBuffer(char* buffer,size_t size,size_t nmemb);

    	CURL *m_pcurl;                        // handle obtained from SingleTone
    	char m_errorBuffer[CURL_ERROR_SIZE];  // registered as CURLOPT_ERRORBUFFER
    	std::string m_szbuffer;               // data received so far
    	std::string m_szUrl;                  // URL of the current request
    	pthread_mutex_t m_http_mutex;
    };
    #endif
    
    #include "Http.h"
    #include "SingleTone.h"
    #include "mutex.h"
    
    // Borrows the CURL easy handle owned by the SingleTone instance; no new
    // handle is created for this object.
    Http::Http()
    {
    	SingleTone* const shared = SingleTone::Instance();
    	m_pcurl = shared->GetpCurl();
    }
    
    
    // Releases nothing: the CURL handle is left untouched here.
    Http::~Http()
    {
    }
    
    
    // Parameterless overload: performs no work and always reports failure.
    bool Http::InitCurl()
    {
    	return false;
    }
    
    
    // Appends size*nmemb bytes starting at `buffer` to m_szbuffer.
    // Returns the number of bytes appended, or 0 when `buffer` is NULL.
    int Http::setBuffer(char *buffer, size_t size, size_t nmemb)
    {
    	if (buffer == NULL)
    	{
    		return 0;
    	}
    	const size_t byteCount = size * nmemb;
    	m_szbuffer.append(buffer, byteCount);
    	return static_cast<int>(byteCount);
    }
    
    void Http::writer(void *buffer, size_t size, size_t nmemb,void* f)
    {
    	static_cast<Http*>(f)->setBuffer((char*)buffer,size,nmemb);
    }
    
    bool Http::InitCurl(const std::string& url, std::string& szbuffer)
    {
    	pthread_mutex_init(&m_http_mutex,NULL);
    	Http::m_szUrl=url;
    	CURLcode result;
    	if(m_pcurl)
    	{
    		curl_easy_setopt(m_pcurl, CURLOPT_ERRORBUFFER, Http::m_errorBuffer);
            curl_easy_setopt(m_pcurl, CURLOPT_URL,m_szUrl.c_str());
            curl_easy_setopt(m_pcurl, CURLOPT_HEADER, 0);
            curl_easy_setopt(m_pcurl, CURLOPT_FOLLOWLOCATION, 1);
            curl_easy_setopt(m_pcurl, CURLOPT_WRITEFUNCTION,Http::writer);
            curl_easy_setopt(m_pcurl, CURLOPT_WRITEDATA,this);
    
    		result = curl_easy_perform(m_pcurl);
    	}
    	if(result!=CURLE_OK)
    	    return false;
    	szbuffer=m_szbuffer;
    	m_szbuffer.clear();
    	m_szUrl.clear();
    	pthread_mutex_destroy(&m_http_mutex);
    	return true;
    }
    
    // Cleans up the CURL handle held by this object and libcurl's global
    // state, then forgets the handle. Always returns true.
    //
    // A second call on the same object is a no-op, so curl_global_cleanup()
    // is not run again for a handle this object already released.
    //
    // NOTE(review): the handle was obtained from SingleTone, which keeps its
    // own copy of the pointer; after this call that copy refers to a
    // cleaned-up handle - confirm no other Http object uses it afterwards.
    bool Http::DeInitCurl()
    {
        if(m_pcurl == NULL)
            return true;

        curl_easy_cleanup(m_pcurl);
        curl_global_cleanup();
        m_pcurl = NULL;
         
        return true;
    }
    
    // Returns a copy of the data currently held in the receive buffer.
    const string Http::getBuffer()
    {
    	return this->m_szbuffer;
    }
    
    
    // Getter despite the name: returns a copy of the stored URL.
    string Http::setUrl()
    {
    	return this->m_szUrl;
    }
    
    void Http::setUrl(const std::string& url)
    {
        Http::m_szUrl = url;
    }

    其中 m_szbuffer 存放抓取到的网页内容。

    抓取成功后,网页内容通过 InitCurl 函数的输出参数 szbuffer 返回。




    版权声明:本文为博主原创文章,未经同意,不得转载。

  • 相关阅读:
    es6基础系列二:Number
    es6基础系列一:let和const
    linux常用命令
    input 事件与汉字输入法:使用compositionend事件解决
    正则表达式
    php 调试环境配置(mac)
    前端实习生:10个月的总结
    人生路:程序员、飞行员?
    科三流水账
    阻止pc端浏览器缩放js代码
  • 原文地址:https://www.cnblogs.com/blfshiye/p/4668313.html
Copyright © 2011-2022 走看看