zoukankan      html  css  js  c++  java
  • 简单分析C之Curl模块同php的curl和python的pycurl模块的关系

    缘起:以前一直喜欢用scrapy做爬虫,实践效果也很好。后来单位让自己用python写一套分布式爬虫,替代公司原有的php爬虫。投入实践后,发现效果确实比原来好:原来20个网站里只能配置10个,现在20个里能配置16个。分析原因,是新的架构设计留有一点扩充性,在大致架构不变的基础上可以进行有限的扩展;而两者的实现原理其实都是通过CURL来实现的。

    php的curl,位于php发行包的ext目录中,是php自带的扩展;启用时需要修改php的配置文件php.ini,将;extension=php_curl.dll前的分号去掉。

    python的pycurl,不是python自带的支持程序,python在做爬虫一般都是用urllib,urllib2,twisted等,比较少的使用pycurl.安装略.

    c的curl(libcurl),是前面2个语言的curl的基础:正是有了c的curl,才有了php的curl和python的pycurl。同时,python的pycurl文档说明了它只实现了部分功能,即是c的curl的一个阉割版。泪奔,原来用了那么长时间的东西,连冰山一角都没触碰;或者说,自己用python的pycurl时也只是用到了其中的一个或少数几个功能。

    如何用:

    C的curl:

    #include <stdio.h>
    #include <curl/curl.h>

    int main(void)
    {
    CURL *curl;
    CURLcode res;

    curl = curl_easy_init();
    if(curl) {
    /* First set the URL that is about to receive our POST. This URL can
    just as well be a https:// URL if that is what should receive the
    data.
    */
    curl_easy_setopt(curl, CURLOPT_URL, "http://postit.example.com/moo.cgi");
    /* Now specify the POST data */
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "name=daniel&project=curl");

    /* Perform the request, res will get the return code */
    res = curl_easy_perform(curl);

    /* always cleanup */
    curl_easy_cleanup(curl);
    }
    return 0;
    }

    php的curl:

    <?php

    // Fetch a page with the curl extension and print its body.
    $c = curl_init();
    curl_setopt($c, CURLOPT_URL, 'http://www.baidu.com');
    // Have curl_exec() return the body as a string instead of printing it
    // directly; without this $data would only be a boolean.
    curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
    $data = curl_exec($c);
    if ($data === false) {
        // Read the error before the handle is closed.
        echo 'curl error: ' . curl_error($c);
    } else {
        // Print the downloaded body (the original echoed the closed handle $c).
        echo $data;
    }
    curl_close($c);

    ?>

    python的pycurl:

    import pycurl


    def body(buffer):
        """Write callback registered below: print each chunk handed to it."""
        # Parenthesised form prints the same text under Python 2 and also
        # parses under Python 3.
        print(buffer)


    c = pycurl.Curl()
    c.setopt(pycurl.URL, "http://www.baidu.com/")
    c.setopt(pycurl.WRITEFUNCTION, body)
    c.perform()
    # Release the handle once the transfer is done.
    c.close()


    主要原理:

    C:

    使用到的数据结构:

    /* Public handle type: callers only ever see a CURL as void. Inside the
       library the same pointer is used as a struct SessionHandle * (see
       curl_easy_init() and curl_easy_setopt()). */
    typedef void CURL;  /* opaque to the caller; just void when the handle is created */
    /* Internal per-session state behind a CURL handle. curl_easy_init() hands
       out a pointer to one of these, and Curl_setopt() stores option values in
       its `set` member. */
    struct SessionHandle {
    struct Names dns;
    struct Curl_multi *multi; /* associated multi handle, if any (TODO confirm; the article called this "multi-threading") */
    struct Curl_one_easy *multi_pos; /* if non-NULL, points to the its position
    in multi controlling structure to assist
    in removal.
    */
    struct Curl_share *share; /* Share, handles global variable mutexing */
    struct HandleData reqdata; /* Request-specific data */
    struct UserDefined set; /* values set by the libcurl user, e.g. through Curl_setopt() */
    struct DynamicStatic change; /* possibly modified userdefined data */

    struct CookieInfo *cookies; /* the cookies, read from files and servers */
    struct Progress progress; /* for all the progress meter data */
    struct UrlState state; /* struct for fields used for state info and
    other dynamic purposes
    */
    struct PureInfo info; /* stats, reports and info data */
    #if defined(CURL_DOES_CONVERSIONS) && defined(HAVE_ICONV)
    iconv_t outbound_cd; /* for translating to the network encoding */
    iconv_t inbound_cd; /* for translating from the network encoding */
    iconv_t utf8_cd; /* for translating to UTF8 */
    #endif /* CURL_DOES_CONVERSIONS && HAVE_ICONV */
    unsigned int magic; /* set to a CURLEASY_MAGIC_NUMBER */
    };

    /* Option values supplied by the libcurl user; this is the type of the
       `set` member of struct SessionHandle. Only the first few fields are
       reproduced in this excerpt. */
    struct UserDefined {
    FILE *err; /* the stderr user data goes here */
    void *debugdata; /* the data that will be passed to fdebug */
    char *errorbuffer; /* (Static) store failure messages in here */
    long proxyport; /* If non-zero, use this port number by default. If the
    proxy string features a ":[port]" that one will override
    this.
    */
     /* ... the remaining fields are omitted from this excerpt ... */
    };


    使用的方法:

    1.初始化curl,得到SessionHandle结构体空间
    /*
     * Create a new easy handle.
     *
     * Runs curl_global_init(CURL_GLOBAL_DEFAULT) first when the library has
     * not been initialized yet, then opens a session with Curl_open().
     * Returns the new session as an opaque CURL pointer, or NULL when either
     * step fails.
     */
    CURL *curl_easy_init(void)
    {
      struct SessionHandle *handle;
      CURLcode rc;

      /* The global (SSL etc.) setup must have happened before any session */
      if(!initialized) {
        rc = curl_global_init(CURL_GLOBAL_DEFAULT);
        if(rc != CURLE_OK) {
          /* global init failed, so there is nothing to hand out */
          DEBUGF(fprintf(stderr, "Error: curl_global_init failed\n"));
          return NULL;
        }
      }

      /* The session is opened without any URL set so far */
      rc = Curl_open(&handle);
      if(rc == CURLE_OK)
        return handle;

      DEBUGF(fprintf(stderr, "Error: Curl_open failed\n"));
      return NULL;
    }

    2.设置参数:
    /*
     * Public entry point for setting one option on an easy handle.
     *
     * Collects the variadic argument that follows `tag` into a va_list and
     * forwards it to Curl_setopt(). Returns CURLE_BAD_FUNCTION_ARGUMENT when
     * `curl` is NULL, otherwise whatever Curl_setopt() returns.
     */
    CURLcode curl_easy_setopt(CURL *curl, CURLoption tag, ...)
    {
      struct SessionHandle *handle = curl;
      va_list args;
      CURLcode status;

      if(!handle)
        return CURLE_BAD_FUNCTION_ARGUMENT;

      va_start(args, tag);
      status = Curl_setopt(handle, tag, args);
      va_end(args);

      return status;
    }
    /*
     * Apply a single option to a session.
     *
     * data   - the session whose settings are changed
     * option - which option to set
     * param  - va_list holding the option's value; each case reads it with
     *          va_arg() using the type that option expects (long, char *,
     *          void *, curl_off_t, struct curl_httppost *)
     *
     * Most cases simply store the value in data->set. String options go
     * through Curl_setstropt() and return its result; CURLOPT_MAXCONNECTS
     * returns the result of Curl_ch_connc(). CURLOPT_COPYPOSTFIELDS returns
     * CURLE_OUT_OF_MEMORY when the requested size is invalid or malloc()
     * fails. An option with no case here yields CURLE_FAILED_INIT.
     * Otherwise CURLE_OK is returned.
     */
    CURLcode Curl_setopt(struct SessionHandle *data, CURLoption option,
    va_list param)
    {
    char *argptr;
    CURLcode result = CURLE_OK;
    #ifndef CURL_DISABLE_HTTP
    curl_off_t bigsize;
    #endif

    switch(option) {
    case CURLOPT_DNS_CACHE_TIMEOUT:
    data->set.dns_cache_timeout = va_arg(param, long);
    break;
    case CURLOPT_DNS_USE_GLOBAL_CACHE:
    {
    long use_cache = va_arg(param, long);
    if (use_cache)
    Curl_global_host_cache_init();

    data->set.global_dns_cache = (bool)(0 != use_cache);
    }
    break;
    case CURLOPT_SSL_CIPHER_LIST:
    /* set a list of cipher we want to use in the SSL connection */
    result = Curl_setstropt(&data->set.str[STRING_SSL_CIPHER_LIST],
    va_arg(param, char *));
    break;

    case CURLOPT_RANDOM_FILE:
    /*
    * This is the path name to a file that contains random data to seed
    * the random SSL stuff with. The file is only used for reading.
    */
    result = Curl_setstropt(&data->set.str[STRING_SSL_RANDOM_FILE],
    va_arg(param, char *));
    break;
    case CURLOPT_EGDSOCKET:
    /*
    * The Entropy Gathering Daemon socket pathname
    */
    result = Curl_setstropt(&data->set.str[STRING_SSL_EGDSOCKET],
    va_arg(param, char *));
    break;
    case CURLOPT_MAXCONNECTS:
    /*
    * Set the absolute number of maximum simultaneous alive connection that
    * libcurl is allowed to have.
    */
    result = Curl_ch_connc(data, data->state.connc, va_arg(param, long));
    break;
    case CURLOPT_FORBID_REUSE:
    /*
    * When this transfer is done, it must not be left to be reused by a
    * subsequent transfer but shall be closed immediately.
    */
    data->set.reuse_forbid = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_FRESH_CONNECT:
    /*
    * This transfer shall not use a previously cached connection but
    * should be made with a fresh new connect!
    */
    data->set.reuse_fresh = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_VERBOSE:
    /*
    * Verbose means infof() calls that give a lot of information about
    * the connection and transfer procedures as well as internal choices.
    */
    data->set.verbose = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_HEADER:
    /*
    * Set to include the header in the general data output stream.
    */
    data->set.include_header = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_NOPROGRESS:
    /*
    * Shut off the internal supported progress meter
    */
    data->set.hide_progress = (bool)(0 != va_arg(param, long));
    if(data->set.hide_progress)
    data->progress.flags |= PGRS_HIDE;
    else
    data->progress.flags &= ~PGRS_HIDE;
    break;
    case CURLOPT_NOBODY:
    /*
    * Do not include the body part in the output data stream.
    */
    data->set.opt_no_body = (bool)(0 != va_arg(param, long));
    if(data->set.opt_no_body)
    /* in HTTP lingo, this means using the HEAD request */
    data->set.httpreq = HTTPREQ_HEAD;
    break;
    case CURLOPT_FAILONERROR:
    /*
    * Don't output the >=300 error code HTML-page, but instead only
    * return error.
    */
    data->set.http_fail_on_error = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_UPLOAD:
    case CURLOPT_PUT:
    /*
    * We want to sent data to the remote host. If this is HTTP, that equals
    * using the PUT request.
    */
    data->set.upload = (bool)(0 != va_arg(param, long));
    if(data->set.upload)
    /* If this is HTTP, PUT is what's needed to "upload" */
    data->set.httpreq = HTTPREQ_PUT;
    break;
    case CURLOPT_FILETIME:
    /*
    * Try to get the file time of the remote document. The time will
    * later (possibly) become available using curl_easy_getinfo().
    */
    data->set.get_filetime = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_FTP_CREATE_MISSING_DIRS:
    /*
    * An FTP option that modifies an upload to create missing directories on
    * the server.
    */
    data->set.ftp_create_missing_dirs = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_FTP_RESPONSE_TIMEOUT:
    /*
    * An FTP option that specifies how quickly an FTP response must be
    * obtained before it is considered failure.
    */
    data->set.ftp_response_timeout = va_arg( param , long ) * 1000;
    break;
    case CURLOPT_DIRLISTONLY:
    /*
    * An option that changes the command to one that asks for a list
    * only, no file info details.
    */
    data->set.ftp_list_only = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_APPEND:
    /*
    * We want to upload and append to an existing file.
    */
    data->set.ftp_append = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_FTP_FILEMETHOD:
    /*
    * How do access files over FTP.
    */
    data->set.ftp_filemethod = (curl_ftpfile)va_arg(param, long);
    break;
    case CURLOPT_NETRC:
    /*
    * Parse the $HOME/.netrc file
    */
    data->set.use_netrc = (enum CURL_NETRC_OPTION)va_arg(param, long);
    break;
    case CURLOPT_NETRC_FILE:
    /*
    * Use this file instead of the $HOME/.netrc file
    */
    result = Curl_setstropt(&data->set.str[STRING_NETRC_FILE],
    va_arg(param, char *));
    break;
    case CURLOPT_TRANSFERTEXT:
    /*
    * This option was previously named 'FTPASCII'. Renamed to work with
    * more protocols than merely FTP.
    *
    * Transfer using ASCII (instead of BINARY).
    */
    data->set.prefer_ascii = (bool)(0 != va_arg(param, long));
    break;
    case CURLOPT_TIMECONDITION:
    /*
    * Set HTTP time condition. This must be one of the defines in the
    * curl/curl.h header file.
    */
    data->set.timecondition = (curl_TimeCond)va_arg(param, long);
    break;
    case CURLOPT_TIMEVALUE:
    /*
    * This is the value to compare with the remote document with the
    * method set with CURLOPT_TIMECONDITION
    */
    data->set.timevalue = (time_t)va_arg(param, long);
    break;
    case CURLOPT_SSLVERSION:
    /*
    * Set explicit SSL version to try to connect with, as some SSL
    * implementations are lame.
    */
    data->set.ssl.version = va_arg(param, long);
    break;

    #ifndef CURL_DISABLE_HTTP
    case CURLOPT_AUTOREFERER:
    /*
    * Switch on automatic referer that gets set if curl follows locations.
    */
    data->set.http_auto_referer = (bool)(0 != va_arg(param, long));
    break;

    case CURLOPT_ENCODING:
    /*
    * String to use at the value of Accept-Encoding header.
    *
    * If the encoding is set to "" we use an Accept-Encoding header that
    * encompasses all the encodings we support.
    * If the encoding is set to NULL we don't send an Accept-Encoding header
    * and ignore an received Content-Encoding header.
    *
    */
    argptr = va_arg(param, char *);
    result = Curl_setstropt(&data->set.str[STRING_ENCODING],
    (argptr && !*argptr)?
    (char *) ALL_CONTENT_ENCODINGS: argptr);
    break;

    case CURLOPT_FOLLOWLOCATION:
    /*
    * Follow Location: header hints on a HTTP-server.
    */
    data->set.http_follow_location = (bool)(0 != va_arg(param, long));
    break;

    case CURLOPT_UNRESTRICTED_AUTH:
    /*
    * Send authentication (user+password) when following locations, even when
    * hostname changed.
    */
    data->set.http_disable_hostname_check_before_authentication =
    (bool)(0 != va_arg(param, long));
    break;

    case CURLOPT_MAXREDIRS:
    /*
    * The maximum amount of hops you allow curl to follow Location:
    * headers. This should mostly be used to detect never-ending loops.
    */
    data->set.maxredirs = va_arg(param, long);
    break;

    case CURLOPT_POST301:
    /*
    * Obey RFC 2616/10.3.2 and resubmit a POST as a POST after a 301.
    */
    data->set.post301 = (bool)(0 != va_arg(param, long));
    break;

    case CURLOPT_POST:
    /* Does this option serve a purpose anymore? Yes it does, when
    CURLOPT_POSTFIELDS isn't used and the POST data is read off the
    callback!
    */
    if(va_arg(param, long)) {
    data->set.httpreq = HTTPREQ_POST;
    data->set.opt_no_body = FALSE; /* this is implied */
    }
    else
    data->set.httpreq = HTTPREQ_GET;
    break;

    case CURLOPT_COPYPOSTFIELDS:
    /*
    * A string with POST data. Makes curl HTTP POST. Even if it is NULL.
    * If needed, CURLOPT_POSTFIELDSIZE must have been set prior to
    * CURLOPT_COPYPOSTFIELDS and not altered later.
    */
    argptr = va_arg(param, char *);

    if (!argptr || data->set.postfieldsize == -1)
    result = Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], argptr);
    else {
    /*
    * Check that requested length does not overflow the size_t type.
    */

    if ((data->set.postfieldsize < 0) ||
    ((sizeof(curl_off_t) != sizeof(size_t)) &&
    (data->set.postfieldsize > (curl_off_t)((size_t)-1))))
    result = CURLE_OUT_OF_MEMORY;
    else {
    char * p;

    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);

    /* Allocate even when size == 0. This satisfies the need of possible
    later address compare to detect the COPYPOSTFIELDS mode, and
    to mark that postfields is used rather than read function or
    form data.
    */
    p = malloc((size_t)(data->set.postfieldsize?data->set.postfieldsize:1));

    if (!p)
    result = CURLE_OUT_OF_MEMORY;
    else {
    if (data->set.postfieldsize)
    memcpy(p, argptr, data->set.postfieldsize);

    data->set.str[STRING_COPYPOSTFIELDS] = p;
    }
    }
    }

    data->set.postfields = data->set.str[STRING_COPYPOSTFIELDS];
    data->set.httpreq = HTTPREQ_POST;
    break;

    case CURLOPT_POSTFIELDS:
    /*
    * Like above, but use static data instead of copying it.
    */
    data->set.postfields = va_arg(param, void *);
    /* Release old copied data. */
    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
    data->set.httpreq = HTTPREQ_POST;
    break;

    case CURLOPT_POSTFIELDSIZE:
    /*
    * The size of the POSTFIELD data to prevent libcurl to do strlen() to
    * figure it out. Enables binary posts.
    */
    bigsize = va_arg(param, long);

    if (data->set.postfieldsize < bigsize &&
    data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
    /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
    data->set.postfields = NULL;
    }

    data->set.postfieldsize = bigsize;
    break;

    case CURLOPT_POSTFIELDSIZE_LARGE:
    /*
    * The size of the POSTFIELD data to prevent libcurl to do strlen() to
    * figure it out. Enables binary posts.
    */
    bigsize = va_arg(param, curl_off_t);

    if (data->set.postfieldsize < bigsize &&
    data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
    /* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
    (void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
    data->set.postfields = NULL;
    }

    data->set.postfieldsize = bigsize;
    break;

    case CURLOPT_HTTPPOST:
    /*
    * Set to make us do HTTP POST
    */
    data->set.httppost = va_arg(param, struct curl_httppost *);
    data->set.httpreq = HTTPREQ_POST_FORM;
    data->set.opt_no_body = FALSE; /* this is implied */
    break;

    case CURLOPT_REFERER:
    /*
    * String to set in the HTTP Referer: field.
    */
    if(data->change.referer_alloc) {
    free(data->change.referer);
    data->change.referer_alloc = FALSE;
    }
    result = Curl_setstropt(&data->set.str[STRING_SET_REFERER],
    va_arg(param, char *));
    data->change.referer = data->set.str[STRING_SET_REFERER];
    break;
    /* A very large number of further cases are omitted from this excerpt;
       per the article author they likewise just update values in *data. */

    default:
    /* unknown tag and its companion, just ignore: */
    result = CURLE_FAILED_INIT; /* correct this */
    break;
    }

    return result;
    }

    3.真正发送请求:

    /*
     * Run the transfer configured on `easy` and block until it is finished.
     *
     * Implemented on top of the multi interface: a temporary multi handle is
     * created, `easy` is added to it, and curl_multi_perform() is called in a
     * loop, waiting between rounds in Curl_select() (one-second timeout) on
     * the descriptors reported by curl_multi_fdset().
     *
     * Returns CURLE_BAD_FUNCTION_ARGUMENT when `easy` is NULL,
     * CURLE_OUT_OF_MEMORY or CURLE_FAILED_INIT when the multi handle cannot
     * be created or `easy` cannot be added to it, and otherwise the result
     * taken from curl_multi_info_read() (CURLE_OK if it yields no message).
     */
    CURLcode curl_easy_perform(CURL *easy)
    {
    CURLM *multi;
    CURLMcode mcode;
    CURLcode code = CURLE_OK;
    int still_running;
    struct timeval timeout;
    int rc;
    CURLMsg *msg;
    fd_set fdread;
    fd_set fdwrite;
    fd_set fdexcep;
    int maxfd;

    if(!easy)
    return CURLE_BAD_FUNCTION_ARGUMENT;

    multi = curl_multi_init();
    if(!multi)
    return CURLE_OUT_OF_MEMORY;

    mcode = curl_multi_add_handle(multi, easy);
    if(mcode) {
    /* adding failed: drop the temporary multi handle before returning */
    curl_multi_cleanup(multi);
    if(mcode == CURLM_OUT_OF_MEMORY)
    return CURLE_OUT_OF_MEMORY;
    else
    return CURLE_FAILED_INIT;
    }

    /* we start some action by calling perform right away */

    do {
    /* keep calling perform for as long as it asks to be called again */
    while(CURLM_CALL_MULTI_PERFORM ==
    curl_multi_perform(multi, &still_running));

    if(!still_running)
    break;

    FD_ZERO(&fdread);
    FD_ZERO(&fdwrite);
    FD_ZERO(&fdexcep);

    /* timeout once per second */
    timeout.tv_sec = 1;
    timeout.tv_usec = 0;

    /* Old deprecated style: get file descriptors from the transfers */
    curl_multi_fdset(multi, &fdread, &fdwrite, &fdexcep, &maxfd);
    rc = Curl_select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout);

    /* The way is to extract the sockets and wait for them without using
    select. This whole alternative version should probably rather use the
    curl_multi_socket() approach.
    */

    /* NOTE(review): after a select error the loop exits and `code` keeps its
       CURLE_OK initial value unless curl_multi_info_read() below yields a
       message -- confirm this is intended. */
    if(rc == -1)
    /* select error */
    break;

    /* timeout or data to send/receive => loop! */
    } while(still_running);

    /* pick up the transfer's result, if a message is available */
    msg = curl_multi_info_read(multi, &rc);
    if(msg)
    code = msg->data.result;

    mcode = curl_multi_remove_handle(multi, easy);
    /* what to do if it fails? */

    mcode = curl_multi_cleanup(multi);
    /* what to do if it fails? */

    return code;
    }

    4.释放申请的内存空间:

    /*
     * Close an easy handle by handing its session to Curl_close().
     * A NULL handle is accepted and ignored.
     */
    void curl_easy_cleanup(CURL *curl)
    {
      struct SessionHandle *handle = (struct SessionHandle *)curl;

      if(handle)
        Curl_close(handle);
    }

    php:

    1.使用的数据结构:

    /* State kept by the PHP curl extension for one cURL resource; curl_init()
       allocates it and registers it as the returned resource. */
    typedef struct {
    struct _php_curl_error err;
    struct _php_curl_free *to_free;
    struct _php_curl_send_headers header;
    void ***thread_ctx;
    CURL *cp; /* the wrapped libcurl easy handle: PHP allocates php_curl itself and reaches libcurl through ch->cp (e.g. curl_easy_setopt(ch->cp, ...)) */
    php_curl_handlers *handlers;
    long id;
    unsigned int uses;
    zend_bool in_callback;
    zval *clone;
    } php_curl;

    2. 使用的方法:

    /*
     * PHP: curl_init([string url])
     *
     * Creates a libcurl easy handle, wraps it in a freshly allocated php_curl,
     * applies the extension's default options and registers the wrapper as
     * the resource returned to the script. When `url` is given it is applied
     * through php_curl_option_url().
     *
     * Returns FALSE (with an E_WARNING) when libcurl cannot create a handle,
     * and FALSE when the URL is rejected. In the latter case the handle is
     * closed again first.
     */
    PHP_FUNCTION(curl_init)
    {
        php_curl *ch;
        CURL *cp;
        zval *clone;
        char *url = NULL;
        int url_len = 0;
        char *cainfo;

        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &url, &url_len) == FAILURE) {
            return;
        }

        cp = curl_easy_init();
        if (!cp) {
            php_error_docref(NULL TSRMLS_CC, E_WARNING, "Could not initialize a new cURL handle");
            RETURN_FALSE;
        }

        alloc_curl_handle(&ch);
        TSRMLS_SET_CTX(ch->thread_ctx);

        ch->cp = cp;

        /* Default handlers: body to stdout, headers ignored */
        ch->handlers->write->method = PHP_CURL_STDOUT;
        ch->handlers->write->type = PHP_CURL_ASCII;
        ch->handlers->read->method = PHP_CURL_DIRECT;
        ch->handlers->write_header->method = PHP_CURL_IGNORE;

        ch->uses = 0;

        MAKE_STD_ZVAL(clone);
        ch->clone = clone;

        /* curl_easy_setopt() is variadic and Curl_setopt() reads these
           values with va_arg(param, long), so numeric options must be passed
           as long; a plain int literal has the wrong type. */
        curl_easy_setopt(ch->cp, CURLOPT_NOPROGRESS, 1L);
        curl_easy_setopt(ch->cp, CURLOPT_VERBOSE, 0L);
        curl_easy_setopt(ch->cp, CURLOPT_ERRORBUFFER, ch->err.str);
        /* Route libcurl's write/read/header callbacks back to this wrapper */
        curl_easy_setopt(ch->cp, CURLOPT_WRITEFUNCTION, curl_write);
        curl_easy_setopt(ch->cp, CURLOPT_FILE, (void *) ch);
        curl_easy_setopt(ch->cp, CURLOPT_READFUNCTION, curl_read);
        curl_easy_setopt(ch->cp, CURLOPT_INFILE, (void *) ch);
        curl_easy_setopt(ch->cp, CURLOPT_HEADERFUNCTION, curl_write_header);
        curl_easy_setopt(ch->cp, CURLOPT_WRITEHEADER, (void *) ch);
        curl_easy_setopt(ch->cp, CURLOPT_DNS_USE_GLOBAL_CACHE, 1L);
        curl_easy_setopt(ch->cp, CURLOPT_DNS_CACHE_TIMEOUT, 120L);
        curl_easy_setopt(ch->cp, CURLOPT_MAXREDIRS, 20L); /* prevent infinite redirects */

        /* Optional CA bundle taken from the curl.cainfo ini setting */
        cainfo = INI_STR("curl.cainfo");
        if (cainfo && strlen(cainfo) > 0) {
            curl_easy_setopt(ch->cp, CURLOPT_CAINFO, cainfo);
        }

    #if defined(ZTS)
        curl_easy_setopt(ch->cp, CURLOPT_NOSIGNAL, 1L);
    #endif

        if (url) {
            if (!php_curl_option_url(ch, url, url_len)) {
                _php_curl_close_ex(ch TSRMLS_CC);
                RETURN_FALSE;
            }
        }

        ZEND_REGISTER_RESOURCE(return_value, ch, le_curl);
        ch->id = Z_LVAL_P(return_value);
    }

    执行真实下载 

    /*
     * PHP: curl_exec(resource ch)
     *
     * Runs the transfer on the wrapped libcurl handle with
     * curl_easy_perform() and records the result code on the handle.
     *
     * Returns FALSE when the transfer fails (CURLE_PARTIAL_FILE is not
     * treated as a failure). With the PHP_CURL_RETURN write method it returns
     * the collected body as a string, or an empty string when nothing was
     * collected. Otherwise it returns TRUE.
     */
    PHP_FUNCTION(curl_exec)
    {
    CURLcode error;
    zval *zid;
    php_curl *ch;

    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
    return;
    }

    ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);

    _php_curl_verify_handlers(ch, 1 TSRMLS_CC);

    _php_curl_cleanup_handle(ch);

    /* this is where the request is actually carried out by libcurl */
    error = curl_easy_perform(ch->cp);
    SAVE_CURL_ERROR(ch, error);
    /* CURLE_PARTIAL_FILE is returned by HEAD requests */
    if (error != CURLE_OK && error != CURLE_PARTIAL_FILE) {
    /* discard whatever part of the body was buffered before the failure */
    if (ch->handlers->write->buf.len > 0) {
    smart_str_free(&ch->handlers->write->buf);
    }
    RETURN_FALSE;
    }

    /* flush the stream configured as std_err, if there is one */
    if (ch->handlers->std_err) {
    php_stream *stream;
    stream = (php_stream*)zend_fetch_resource(&ch->handlers->std_err TSRMLS_CC, -1, NULL, NULL, 2, php_file_le_stream(), php_file_le_pstream());
    if (stream) {
    php_stream_flush(stream);
    }
    }

    /* return-transfer mode with data: hand the buffered body to the script */
    if (ch->handlers->write->method == PHP_CURL_RETURN && ch->handlers->write->buf.len > 0) {
    smart_str_0(&ch->handlers->write->buf);
    RETURN_STRINGL(ch->handlers->write->buf.c, ch->handlers->write->buf.len, 1);
    }

    /* flush the file handle, so any remaining data is synched to disk */
    if (ch->handlers->write->method == PHP_CURL_FILE && ch->handlers->write->fp) {
    fflush(ch->handlers->write->fp);
    }
    if (ch->handlers->write_header->method == PHP_CURL_FILE && ch->handlers->write_header->fp) {
    fflush(ch->handlers->write_header->fp);
    }

    /* return-transfer mode but nothing was buffered: empty string, not TRUE */
    if (ch->handlers->write->method == PHP_CURL_RETURN) {
    RETURN_EMPTY_STRING();
    } else {
    RETURN_TRUE;
    }
    }

    关闭程序,清空内存

    /*
     * PHP: curl_close(resource ch)
     *
     * Refuses, with an E_WARNING, to close a handle from inside one of its
     * own callbacks. Otherwise it either decrements the handle's `uses`
     * counter or, when that counter is already zero, deletes the resource
     * from the resource list.
     */
    PHP_FUNCTION(curl_close)
    {
        php_curl *ch;
        zval *zid;

        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
            return;
        }

        ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);

        if (ch->in_callback) {
            php_error_docref(NULL TSRMLS_CC, E_WARNING, "Attempt to close cURL handle from a callback");
            return;
        }

        /* No outstanding uses: the resource itself can go away */
        if (!ch->uses) {
            zend_list_delete(Z_LVAL_P(zid));
            return;
        }

        ch->uses--;
    }

    python的pycurl

    1.使用的数据结构:

    /* Python-level Curl object. It wraps one libcurl easy handle (`handle`,
       created with curl_easy_init() in do_curl_new()) together with the
       Python callbacks, file objects and option buffers attached to it. */
    typedef struct {
    PyObject_HEAD
    PyObject *dict; /* Python attributes dictionary */
    CURL *handle; /* the underlying libcurl (C) easy handle */
    PyThreadState *state;
    CurlMultiObject *multi_stack;
    CurlShareObject *share;
    struct curl_httppost *httppost;
    struct curl_slist *httpheader;
    struct curl_slist *http200aliases;
    struct curl_slist *quote;
    struct curl_slist *postquote;
    struct curl_slist *prequote;
    /* callbacks */
    PyObject *w_cb;
    PyObject *h_cb;
    PyObject *r_cb;
    PyObject *pro_cb;
    PyObject *debug_cb;
    PyObject *ioctl_cb;
    PyObject *opensocket_cb;
    /* file objects */
    PyObject *readdata_fp;
    PyObject *writedata_fp;
    PyObject *writeheader_fp;
    /* misc */
    void *options[OPTIONS_SIZE]; /* for OBJECTPOINT options */
    char error[CURL_ERROR_SIZE+1];
    } CurlObject;


    方法:

    1.初始化对象:

    /*
     * Create a new CurlObject.
     *
     * Allocates the Python object, creates its libcurl easy handle with
     * curl_easy_init() and applies the default options (error buffer,
     * back-reference, NOPROGRESS on, VERBOSE off, FTP_ACCOUNT NULL, and a
     * "PycURL/<LIBCURL_VERSION>" user agent).
     *
     * Returns the new object. Returns NULL when the allocation in
     * util_curl_new() fails, and NULL with ErrorObject set when any later
     * step fails.
     */
    static CurlObject *
    do_curl_new(PyObject *dummy)
    {
    CurlObject *self = NULL;
    int res;
    char *s = NULL;

    UNUSED(dummy);

    /* Allocate python curl object */
    self = util_curl_new();
    if (self == NULL)
    return NULL;

    /* Initialize curl handle */
    self->handle = curl_easy_init();
    if (self->handle == NULL)
    goto error;

    /* Set curl error buffer and zero it */
    res = curl_easy_setopt(self->handle, CURLOPT_ERRORBUFFER, self->error);
    if (res != CURLE_OK)
    goto error;
    memset(self->error, 0, sizeof(self->error));

    /* Set backreference */
    res = curl_easy_setopt(self->handle, CURLOPT_PRIVATE, (char *) self);
    if (res != CURLE_OK)
    goto error;

    /* Enable NOPROGRESS by default, i.e. no progress output */
    res = curl_easy_setopt(self->handle, CURLOPT_NOPROGRESS, (long)1);
    if (res != CURLE_OK)
    goto error;

    /* Disable VERBOSE by default, i.e. no verbose output */
    res = curl_easy_setopt(self->handle, CURLOPT_VERBOSE, (long)0);
    if (res != CURLE_OK)
    goto error;

    /* Set FTP_ACCOUNT to NULL by default */
    res = curl_easy_setopt(self->handle, CURLOPT_FTP_ACCOUNT, NULL);
    if (res != CURLE_OK)
    goto error;

    /* Set default USERAGENT */
    /* 7 is the length of "PycURL/"; the extra 1 is for the terminating NUL */
    s = (char *) malloc(7 + strlen(LIBCURL_VERSION) + 1);
    if (s == NULL)
    goto error;
    strcpy(s, "PycURL/"); strcpy(s+7, LIBCURL_VERSION);
    res = curl_easy_setopt(self->handle, CURLOPT_USERAGENT, (char *) s); /* pycurl calls straight into libcurl's curl_easy_setopt() here; the handle itself was created by curl_easy_init() above */
    if (res != CURLE_OK) {
    free(s);
    goto error;
    }
    /* remember the buffer in the option's slot (presumably so it can be
       freed later -- confirm); s is cleared so it is not reused here */
    self->options[ OPT_INDEX(CURLOPT_USERAGENT) ] = s; s = NULL;

    /* Success - return new object */
    return self;

    error:
    Py_DECREF(self); /* this also closes self->handle */
    PyErr_SetString(ErrorObject, "initializing curl failed");
    return NULL;
    }

    2.设置参数 

    do_curl_setopt(CurlObject *self, PyObject *args)
    {
    int option;
    PyObject *obj;
    int res;

    if (!PyArg_ParseTuple(args, "iO:setopt", &option, &obj))
    return NULL;
    if (check_curl_state(self, 1 | 2, "setopt") != 0)
    return NULL;

    /* early checks of option value */
    if (option <= 0)
    goto error;
    if (option >= (int)CURLOPTTYPE_OFF_T + OPTIONS_SIZE)
    goto error;
    if (option % 10000 >= OPTIONS_SIZE)
    goto error;

    #if 0 /* XXX - should we ??? */
    /* Handle the case of None */
    if (obj == Py_None) {
    return util_curl_unsetopt(self, option);
    }
    #endif

    /* Handle the case of string arguments */
    if (PyString_Check(obj)) {
    char *str = NULL;
    Py_ssize_t len = -1;
    char *buf;
    int opt_index;

    /* Check that the option specified a string as well as the input */
    switch (option) {
    case CURLOPT_CAINFO:
    /*此处省略大量case:均为pycurl支持的、接受字符串参数的curl选项*/
    case CURLOPT_CRLFILE:
    case CURLOPT_ISSUERCERT:
    /* FIXME: check if more of these options allow binary data */
    str = PyString_AsString_NoNUL(obj);
    if (str == NULL)
    return NULL;
    break;
    case CURLOPT_POSTFIELDS:
    if (PyString_AsStringAndSize(obj, &str, &len) != 0)
    return NULL;
    /* automatically set POSTFIELDSIZE */
    if (len <= INT_MAX) {
    res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE, (long)len); /*可以看到pycurl的设置参数也就是使用的c的curl的curl_easy_setopt,即是对C的curl的一种封装*/
    } else {
    res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)len);
    }
    if (res != CURLE_OK) {
    CURLERROR_RETVAL();
    }
    break;
    default:
    PyErr_SetString(PyExc_TypeError, "strings are not supported for this option");
    return NULL;
    }
    /* Allocate memory to hold the string */
    assert(str != NULL);
    if (len <= 0)
    buf = strdup(str);
    else {
    buf = (char *) malloc(len);
    if (buf) memcpy(buf, str, len);
    }
    if (buf == NULL)
    return PyErr_NoMemory();
    /* Call setopt */
    res = curl_easy_setopt(self->handle, (CURLoption)option, buf);
    /* Check for errors */
    if (res != CURLE_OK) {
    free(buf);
    CURLERROR_RETVAL();
    }
    /* Save allocated option buffer */
    opt_index = OPT_INDEX(option);
    if (self->options[opt_index] != NULL) {
    free(self->options[opt_index]);
    self->options[opt_index] = NULL;
    }
    self->options[opt_index] = buf;
    Py_INCREF(Py_None);
    return Py_None;
    }


    3.关闭连接,或者说是删除内存中对象。

    /*
     * Python method Curl.close().
     *
     * Returns NULL when check_curl_state() rejects the call. Otherwise it
     * closes the object through util_curl_close() and returns None.
     */
    static PyObject *
    do_curl_close(CurlObject *self)
    {
        if (check_curl_state(self, 2, "close") == 0) {
            /* presumably releases the CurlObject's resources; util_curl_close
               is not shown in this excerpt */
            util_curl_close(self);
            Py_INCREF(Py_None);
            return Py_None;
        }

        return NULL;
    }

    由以上分析可以看出,php的curl和python的pycurl都是对C的curl(libcurl)的一种封装。如果想写出一个更符合自己需求的配置型爬虫,可以考虑直接用C写;不过用C写爬虫不适合快速开发,这是由代码量决定的。

    当然更好的建议是使用webkit做爬虫,作为部分浏览器内核,毋庸置疑。以后再说.











  • 相关阅读:
    day22【网络编程】
    day21【缓冲流、转换流、序列化流】
    day20【字节流、字符流】
    设计模式7-适配器模式
    设计模式6-状态模式
    设计模式5-观察者模式
    设计模式4-建造者模式
    Web Service与WCF与Web API区别
    设计模式3-外观模式
    设计模式2-模板方法模式
  • 原文地址:https://www.cnblogs.com/CLTANG/p/curl_php_python.html
Copyright © 2011-2022 走看看