zoukankan      html  css  js  c++  java
  • C语言爬虫

    C语言爬虫
    http://bbs.bccn.net/thread-504909-1-1.html
    https://cloud.tencent.com/developer/information/c%E8%AF%AD%E8%A8%80%E7%BC%96%E5%86%99%E7%88%AC%E8%99%AB
    参考爬http的爬虫代码,只要GET + source + HTTP/1.1 +host ,然后接受返回的消息就是网页内容。
    现在很多网站都相继改成https的了,我试着用爬http的代码爬了一下https的网站,但是得到的是  Your request has bad syntax or is inherently impossible to satisfy.
    抓包看了一下,
    1、www.zhihu.com
    三次握手后(应该是connect之后自带的三次握手吧?),客户端发送的第一个包还是 GET / HTTP/1.1 和其他的一些内容。这里其他的内容有:Host,Connection,Accept,Upgrade-Insecure-Requests,User-Agent,Referer,Accept-Encoding,Accept-Language,Cookie,udid,_zap,d_c0,l_cap_id(抓的是访问知乎首页的包),服务器返回的是 HTTP/1.1 302 Found 和一些参数,包括cookie之类的东西。
    然后客户端和服务器互相发了两次TCP包(是不是也是tcp/ip帮我们已经实现的保持通联的或者确认的包?),之后就是client hello 、server hello

    2、www.taobao.com
    三次握手之后,直接就是 client hello和server hello

    问题:
    1、访问https网站到底需不需要先GET。如果需要的话,像访问知乎中Cookie,udid,_zap,d_c0,l_cap_id等参数是怎么确定的?(因为我直接用GET + source + HTTP/1.1 +host,返回的是 Your request has bad syntax or is inherently impossible to satisfy)


    下面是参考的爬虫代码
    程序代码:
    //#include <Windows.h>
    #include "winsock2.h"

    #include <time.h>

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <fstream>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Legacy MSVC extension header, kept for backward compatibility.
    #include <hash_set>
    
    #pragma comment(lib, "ws2_32.lib") 
    using namespace std;
    
    // Initial size in bytes (1 MiB) of the buffer GetHttpResponse() allocates
    // for a server reply; the buffer is doubled whenever it is nearly full.
    #define DEFAULT_PAGE_BUF_SIZE 1048576

    // Links discovered by HTMLParse() that main() has not crawled yet (FIFO,
    // which gives the breadth-first order).
    std::queue<std::string> hrefUrl;
    // Every href value that has already been queued, so it is queued only once.
    // std::unordered_set replaces the non-standard, deprecated hash_set.
    std::unordered_set<std::string> visitedUrl;
    // Every image URL that has already been handed to the downloader.
    std::unordered_set<std::string> visitedImg;
    // Not referenced by any code visible in this file.
    int depth=0;
    // Running number printed in front of each image saved by DownLoadImg().
    int g_ImgCnt=1;
    
    // Splits a URL into the host name and the resource (path + query) part.
    //
    // url      - URL to split; an optional leading "http://" is skipped.
    // host     - out: everything up to (not including) the first '/'.
    // resource - out: everything from the first '/' up to the first whitespace
    //            character, or "/" when the URL has no path.
    //
    // Returns false, leaving the outputs untouched, when the URL is longer than
    // 2000 characters or has no host part (empty or relative URL).
    bool ParseURL( const std::string & url, std::string & host, std::string & resource){
        if ( url.size() > 2000 ) {
            return false;
        }

        // Only strip the scheme when it is a real prefix; matching it anywhere
        // would mis-parse URLs that carry another URL in their query string.
        const std::string scheme = "http://";
        std::string::size_type start = 0;
        if ( url.compare( 0, scheme.size(), scheme ) == 0 ) {
            start = scheme.size();
        }

        const std::string::size_type slash = url.find( '/', start );
        std::string parsedHost;
        std::string parsedResource;
        if ( slash == std::string::npos ) {
            // No path at all: request the site root.
            parsedHost = url.substr( start );
            parsedResource = "/";
        } else {
            parsedHost = url.substr( start, slash - start );
            parsedResource = url.substr( slash );
            // A request line cannot contain whitespace, so cut the resource
            // at the first whitespace character.
            const std::string::size_type blank = parsedResource.find_first_of( " \t\r\n\v\f" );
            if ( blank != std::string::npos ) {
                parsedResource.erase( blank );
            }
        }

        if ( parsedHost.empty() ) {
            return false;
        }

        host = parsedHost;
        resource = parsedResource;
        return true;
    }
    
    //使用Get请求,得到响应bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){
        string host, resource;
        if(!ParseURL( url, host, resource )){
            cout << "Can not parse the url"<<endl;
            return false;
        }
        
        //建立socket    struct hostent * hp= gethostbyname( host.c_str() );
        if( hp==NULL ){
            cout<< "Can not find host address"<<endl;
            return false;
        }
    
        SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP);
        if( sock == -1 || sock == -2 ){
            cout << "Can not create sock."<<endl;
            return false;
        }
    
        //建立服务器地址    SOCKADDR_IN sa;
        sa.sin_family = AF_INET;
        sa.sin_port = htons( 80 );
        //char addr[5];
        //memcpy( addr, hp->h_addr, 4 );
        //sa.sin_addr.s_addr = inet_addr(hp->h_addr);    memcpy( &sa.sin_addr, hp->h_addr, 4 );
    
        //建立连接    if( 0!= connect( sock, (SOCKADDR*)&sa, sizeof(sa) ) ){
            cout << "Can not connect: "<< url <<endl;
            closesocket(sock);
            return false;
        };
    
        //准备发送数据    string request = "GET " + resource + " HTTP/1.1
    Host:" + host + "
    Connection:Close
    
    ";//字符串拼接
    
        //发送数据    if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){//request.size()=strlen(request.c_str)        cout << "send error" <<endl;
            closesocket( sock );
            return false;
        }
    
        //接收数据    int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
        char *pageBuf = (char *)malloc(m_nContentLength);
        memset(pageBuf, 0, m_nContentLength);
    
        bytesRead = 0;
        int ret = 1;
        cout <<"Read: ";
        while(ret > 0)
        {
            ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);        
            if(ret > 0)
            {
                bytesRead += ret;
            }
    
            if( m_nContentLength - bytesRead<100)
            {
                cout << "
    Realloc memorry"<<endl;
                m_nContentLength *=2;
                pageBuf = (char*)realloc( pageBuf, m_nContentLength);       //重新分配内存        }
            cout << ret <<"";
        }
        cout <<endl;
    
        pageBuf[bytesRead] = '';
        response = pageBuf;
        closesocket( sock );
        return true;
        //cout<< response <<endl;}
    
    //提取所有的URL以及图片URLvoid HTMLParse ( string & htmlResponse, vector<string> & imgurls, const string & host ){
        //Sleep(10000);
        //找所有连接,加入queue中    const char *p= htmlResponse.c_str();
        char *tag="href="";//href的内容 就是指要跳转的路由 或 方法     const char *pos = strstr( p, tag );
        ofstream ofile("url.txt", ios::app);//以追加的方式打开文件    while( pos )
        {
            pos +=strlen(tag);
            const char * nextQ = strstr( pos, """ );
            if( nextQ )
            {
                char * url = new char[ nextQ-pos+1 ];
                //char url[100]; //固定大小的会发生缓冲区溢出的危险            sscanf( pos, "%[^"]", url);
                string surl = url;  // 转换成string类型,可以自动释放内存            if( visitedUrl.find( surl ) == visitedUrl.end() ){//   visitedUrl.find( surl )=visitedUrl.end() 表示visitedUrl中没有sur1                visitedUrl.insert( surl );
                    ofile << surl<<endl;
                    hrefUrl.push( surl );//将sur1插到队列末            }
                pos = strstr(pos, tag );
                delete [] url;  // 释放掉申请的内存        }
        }
        ofile << endl << endl;
        ofile.close();
    
        tag ="<img ";
        const char* att1= "src="";//规定图像|音视频等的 URL。    const char* att2="lazy-src="";//lazy_src=还是lazy-src=    const char *pos0 = strstr( p, tag );
        while( pos0 )
        {
            pos0 += strlen( tag );
            const char* pos2 = strstr( pos0, att2 );
            if( !pos2 || pos2 > strstr( pos0, ">") ) 
            {
                pos = strstr( pos0, att1);//非延迟加载项            if(!pos) 
                {
                    pos0 = strstr(att1, tag );
                    continue;
                } 
                else 
                {
                    pos = pos + strlen(att1);
                }
            }
            else //如果是延迟加载项        {
                pos = pos2 + strlen(att2);
            }
    
            const char * nextQ = strstr( pos, """);
            if( nextQ )
            {
                char * url = new char[nextQ-pos+1];
                sscanf( pos, "%[^"]", url);
                cout << url<<endl;
                string imgUrl = url;
                if( visitedImg.find( imgUrl ) == visitedImg.end() )
                {
                    visitedImg.insert( imgUrl );
                    imgurls.push_back( imgUrl );//imgurls作为模板队列,push_back的参数由单个字符变成string类型            }
                pos0 = strstr(pos0, tag );
                delete [] url;
            }
        }
        cout << "end of Parse this html"<<endl;
    }
    
    // Turns a URL into a file name: drops every character from the set
    // \ / : * ? " < > | and appends ".txt". All other characters are kept as-is.
    std::string ToFileName( const std::string &url ){
        const std::string forbidden = "\\/:*?\"<>|";
        std::string fileName;
        fileName.reserve( url.size() + 4 );
        for( std::string::size_type i=0; i<url.size(); i++){
            const char ch = url[i];
            if( forbidden.find( ch ) == std::string::npos )
                fileName += ch;
        }
        return fileName + ".txt";
    }
    
    //下载图片到img文件夹void DownLoadImg( vector<string> & imgurls, const string &url ){
    
        //生成保存该url下图片的文件夹    string foldname = ToFileName( url );
        foldname = "./img/"+foldname;
        if(!CreateDirectory( foldname.c_str(),NULL ))
            cout << "Can not create directory:"<< foldname<<endl;
        char *image;
        int byteRead;
        for( int i=0; i<imgurls.size(); i++)
        {
            //判断是否为图片,bmp,jgp,jpeg,gif         string str = imgurls[i];
            int pos = str.find_last_of(".");//返回值为相对于起点的偏移位        if( pos == string::npos )
                continue;
            else
            {
                string ext = str.substr( pos+1, str.size()-pos-1 );
                if( ext!="bmp"&& ext!="jpg" && ext!="jpeg"&& ext!="gif"&&ext!="png")
                    continue;
            }
            //下载其中的内容        if( GetHttpResponse(imgurls[i], image, byteRead))
            {
                if ( strlen(image) ==0 ) 
                {
                    continue;
                }
                const char *p=image;
                const char * pos = strstr(p,"
    
    ")+strlen("
    
    ");
                int index = imgurls[i].find_last_of("/");
                if( index!=string::npos )
                {
                    string imgname = imgurls[i].substr( index , imgurls[i].size() );
                    ofstream ofile( foldname+imgname, ios::binary );
                    if( !ofile.is_open() )
                        continue;
                    cout <<g_ImgCnt++<< foldname+imgname<<endl;
                    ofile.write( pos, byteRead- (pos-p) );
                    ofile.close();
                }
                free(image);
            }
        }
    }
    
    //广度遍历void BFS( const string & url ){
        char * response;
        int bytes;
        // 获取网页的相应,放入response中。    if( !GetHttpResponse( url, response, bytes ) ){
            cout << "The url is wrong! ignore." << endl;
            return;
        }
        string httpResponse=response;
        free( response );
        string filename = ToFileName( url );
        ofstream ofile( "./html/"+filename );
        if( ofile.is_open() ){
            // 保存该网页的文本内容        ofile << httpResponse << endl;
            ofile.close();
        }
        vector<string> imgurls;
        //解析该网页的所有图片链接,放入imgurls里面    HTMLParse( httpResponse,  imgurls, url );
        
        //下载所有的图片资源    DownLoadImg( imgurls, url );
    }
    
    void main()
    {
        //初始化socket,用于tcp网络连接    WSADATA wsaData;
        if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){
            return;
        }
    
        // 创建文件夹,保存图片和网页文本文件    CreateDirectory( "./img",0);
        CreateDirectory("./html",0);
        //string urlStart = "http://hao.360.cn/meinvdaohang.html";
    
        // 遍历的起始地址
        // string urlStart = "http://www.wmpic.me/tupian";
        //string urlStart = "http://item.taobao.com/item.htm?spm=a230r.1.14.19.sBBNbz&id=36366887850&ns=1#detail";    string urlStart = "www.ruanyifeng.com/blog/2014/02/ssl_tls.html";
        
        // 使用广度遍历
        // 提取网页中的超链接放入hrefUrl中,提取图片链接,下载图片。    BFS( urlStart );
    
        // 访问过的网址保存起来    visitedUrl.insert( urlStart );
    
        while( hrefUrl.size()!=0 ){
            string url = hrefUrl.front();  // 从队列的最开始取出一个网址        cout << url << endl;
            BFS( url );                      // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片        hrefUrl.pop();                 // 遍历完之后,删除这个网址    }
        WSACleanup();
        system("pause");
        return;
    }
    
     
    搜索更多相关主题的帖子: inherently request source C语言 网页 
  • 相关阅读:
    Java使用google开源工具Thumbnailator实现图片压缩
    nginx基本配置
    CopyPropertis
    微服务(Microservices )简介
    jQuery ajax()使用serialize()提交form数据
    $.getJSON( )的使用方法简介
    理解 CSS 的 z-index 属性
    JS中的call()和apply()方法
    CSS文字换行详细解说
    如何实现JS函数的重载
  • 原文地址:https://www.cnblogs.com/xinxihua/p/14490329.html
Copyright © 2011-2022 走看看