zoukankan      html  css  js  c++  java
  • 抓取url的超链接程序(使用到libcurl和libxml2)

    抓取url的超链接程序(使用到libcurl和libxml2)

    分类: Linux程序设计(C/C++) 215人阅读 评论(0) 收藏 举报

    抓取url的超链接程序(使用到libcurl和libxml2)

    写了一个c++语言小程序,当作练习。
    c++文件:
    #include <iostream>
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include <string>
    #include <vector>
    #include <curl/curl.h>
    #include <libxml/HTMLparser.h>

    using namespace std;

    // COMPARE(a, b): true when the two C strings are equal ignoring case.
    // The MSVC runtime spells the comparison stricmp; elsewhere strcasecmp is used.
    #if defined(_MSC_VER)
    #define COMPARE(a, b) (stricmp((a), (b)) == 0)
    #else
    #define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
    #endif

    // One extracted hyperlink: the URL captured from the anchor tag and the
    // text collected between the opening and closing tag.
    struct LinkStringDefined
    {
        string url;          // link target as stored by the SAX callbacks
        string anthor_text;  // anchor text (field name kept as spelled by callers)
    };
    typedef LinkStringDefined LinkString;

    typedef struct ContextDefined
    {
        ContextDefined(): addTitle(false) { }
        bool addTitle;
        string title;
        string url;
        vector<LinkString> terms;
    } Context;

    static char errorBuffer[CURL_ERROR_SIZE];
    static string buffer;
    static int writer(char *, size_t, size_t, string *);
    static bool init(CURL *&, char *);
    static void parseHtml(const string &, vector<LinkString> &);
    static void StartElement(void *, const xmlChar *, const xmlChar **);
    static void EndElement(void *, const xmlChar *);
    static void Characters(void *, const xmlChar *, int);
    static void CdataBlock(void *, const xmlChar *, int);

    int main(int argc, char* argv[])
    {
        CURL *conn = NULL;
        CURLcode code;
        vector<LinkString> arr;
        if (argc != 2)
        {
            fprintf(stderr, "Usage: %s <url>/n", argv[0]);
            exit(EXIT_FAILURE);
        }
        curl_global_init(CURL_GLOBAL_DEFAULT);
        if (!init(conn, argv[1]))
        {
            fprintf(stderr, "Connection initializion failed/n");
            exit(EXIT_FAILURE);
        }
        code = curl_easy_perform(conn);
        curl_easy_cleanup(conn);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to get '%s' [%s]/n", argv[1], errorBuffer);
            exit(EXIT_FAILURE);
        }
        parseHtml(buffer, arr);
        int arr_size = arr.size();
        for(int i = 0; i < arr_size; i ++)
        {
            cout << arr[i].anthor_text << "/t" << arr[i].url << endl;
        }
        return 0;
    }

    static bool init(CURL *&conn, char *url)
    {
        CURLcode code;
        conn = curl_easy_init();
        if (conn == NULL)
        {
            fprintf(stderr, "Failed to create CURL connection/n");
            exit(EXIT_FAILURE);
        }
        code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set error buffer [%d]/n", code);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_URL, url);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set URL [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set redirect option [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set writer [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set write data [%s]/n", errorBuffer);
            return false;
        }
        return true;
    }

    /**
     * Write callback registered with CURLOPT_WRITEFUNCTION: appends the
     * received bytes to *writerData.
     *
     * @return the number of bytes appended (size * nmemb), or 0 when
     *         writerData is NULL.
     */
    // NOTE(review): libcurl documents this callback as returning size_t; the
    // int return type is kept here because it must match the prototype above.
    static int writer(char *data, size_t size, size_t nmemb, string *writerData)
    {
        if (!writerData)
        {
            return 0;
        }
        const unsigned long long byteCount = size * nmemb;
        writerData->append(data, byteCount);
        return static_cast<int>(byteCount);
    }

    // SAX callback table passed to htmlCreatePushParserCtxt() in parseHtml().
    // The initializer is positional: each entry fills the next member of
    // htmlSAXHandler in declaration order, so entries must not be reordered.
    // Only four callbacks are installed; every other slot is left NULL.
    // NOTE(review): the slot names in the comments below are taken from the
    // libxml2 xmlSAXHandler layout — confirm against the installed header.
    static htmlSAXHandler saxHandler =
    {
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        StartElement,   // 15th slot: presumably startElement
        EndElement,     // 16th slot: presumably endElement
        NULL,
        Characters,     // 18th slot: presumably characters
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        CdataBlock,     // 26th slot: presumably cdataBlock
        NULL
    };

    static void parseHtml(const string &html, vector<LinkString> &arr)
    {
        htmlParserCtxtPtr ctxt;
        Context context;
        ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE);
        htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
        htmlParseChunk(ctxt, "", 0, 1);
        htmlFreeParserCtxt(ctxt);
        arr = context.terms;
    }

    static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes)
    {
    //    int sz_att = sizeof(**attributes) / sizeof(xmlChar);
        Context *context = (Context *)voidContext;
        if (COMPARE((char *)name, "a"))
        {
            context->url = (char *)attributes[1];
            context->title = "";
            context->addTitle = true;
        }
    }

    static void EndElement(void *voidContext, const xmlChar *name)
    {
        Context *context = (Context *)voidContext;
        if (COMPARE((char *)name, "a"))
        {
            context->url = "";
            context->addTitle = false;
        }
    }

    /**
     * Shared body of the Characters and CdataBlock callbacks: while anchor
     * text is being collected, appends the chunk to the current title and
     * records it against the current URL.
     *
     * The parser may deliver the text of one anchor in several chunks. The
     * first chunk creates the entry; later chunks extend that same entry, so
     * each anchor yields a single result rather than a series of partial
     * duplicates.
     *
     * @param context  parsing state.
     * @param chars    text chunk (not NUL-terminated).
     * @param length   number of bytes in `chars`.
     */
    static void handleCharacters(Context *context, const xmlChar *chars, int length)
    {
        if (!context->addTitle)
        {
            return;
        }

        // A non-empty title here means an earlier chunk of this anchor was
        // already recorded, because the title is reset when an <a> opens and
        // is only extended in this function.
        const bool continuing = !context->title.empty()
            && !context->terms.empty()
            && context->terms.back().url == context->url;

        context->title.append((const char *)chars, length);

        if (continuing)
        {
            context->terms.back().anthor_text = context->title;
        }
        else
        {
            LinkString linkString;
            linkString.anthor_text = context->title;
            linkString.url = context->url;
            context->terms.push_back(linkString);
        }
    }

    static void Characters(void *voidContext, const xmlChar *chars, int length)
    {
        Context *context = (Context *)voidContext;
        handleCharacters(context, chars, length);
    }

    static void CdataBlock(void *voidContext, const xmlChar *chars, int length)
    {
        Context *context = (Context *)voidContext;
        handleCharacters(context, chars, length);
    }

    Makefile文件:
    # Builds get_url_info from get_url_info.cpp (requires the libcurl and
    # libxml2 development files).
    # NOTE: in a real Makefile every recipe line must start with a TAB character.
    CXX = g++

    WARNING = -Wall
    GDBDEBUG = -g
    # Libraries are listed here and placed AFTER the object files on the link
    # line: the linker resolves symbols left to right, so libraries named
    # before the objects that use them can be dropped (e.g. with --as-needed
    # or static linking), producing undefined-reference errors.
    LIBS = -L. -lcurl -lxml2
    DEFAULT_INCLUDE = -I. -I/usr/include
    ADDED_INCLUDE= -I /usr/include/libxml2
    OPTIMIZE = -O2

    allprog = get_url_info
    object1 = get_url_info.o
    complied = $(CXX) $(GDBDEBUG) $(WARNING) $(OPTIMIZE) $(DEFAULT_INCLUDE) $(ADDED_INCLUDE) -c $*.cpp

    all : ${allprog}
    .cpp.o :
        $(complied)
    get_url_info : $(object1)
        $(CXX) $(GDBDEBUG) $(WARNING) $(OPTIMIZE) -o get_url_info $(object1) $(LIBS)

    .PHONY : all cleanall clean cleanobj
    cleanall :
        -rm -f *.o ${allprog}
    clean :
        -rm -f $(object1) get_url_info
    cleanobj :
        -rm -f *.o
  • 相关阅读:
    grunt in webstorm
    10+ Best Responsive HTML5 AngularJS Templates
    响应式布局
    responsive grid
    responsive layout
    js event bubble and capturing
    Understanding Service Types
    To add private variable to this Javascript literal object
    Centering HTML elements larger than their parents
    java5 新特性
  • 原文地址:https://www.cnblogs.com/moonvan/p/2174461.html
Copyright © 2011-2022 走看看