缘起:以前一直喜欢用scrapy做爬虫,并且实践效果也很好,后来由于单位让自己写一套分布式爬虫(python实现),替代公司原有的爬虫(php实现),大致用于实践后,发现效果是比原来的效果好,原来能做配置的网站20个里能配置10个,现在20个里能配置16个,分析原因,是架构设计方面有那么一点点扩充性,在大致架构不变的基础上,可进行有限的扩展,而其实实现的原理都是通过CURL来实现的。
php的curl,是在php发布程序的ext文件中,作为一个php自带的支持,需要改写php的配置文件,修改php.ini,将;extension=php_curl.dll前的分号去掉。
python的pycurl,不是python自带的支持程序,python在做爬虫一般都是用urllib,urllib2,twisted等,比较少的使用pycurl.安装略.
c的curl,是前面2个语言的curl父程序,是c的curl才有了php的curl和python的pycurl,同时,python的pycurl文档说明了只实现了部分功能,即是一个c的curl的阉割版。泪奔,原来用了那么长时间的东西,连冰山一角都没触碰,或者python的pycurl也只是会用其中的一个或少数几个功能。
如何用:
C的curl:
#include <stdio.h>
#include <curl/curl.h>
int main(void)
{
CURL *curl;
CURLcode res;
curl = curl_easy_init();
if(curl) {
/* First set the URL that is about to receive our POST. This URL can
just as well be a https:// URL if that is what should receive the
data. */
curl_easy_setopt(curl, CURLOPT_URL, "http://postit.example.com/moo.cgi");
/* Now specify the POST data */
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "name=daniel&project=curl");
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* always cleanup */
curl_easy_cleanup(curl);
}
return 0;
}
php的curl:
<?php
$c = curl_init();
curl_setopt($c, CURLOPT_URL, 'http://www.baidu.com');
$data = curl_exec($c);
curl_close($c);
echo $c;
?>
python的pycurl:
import pycurl
def body(buffer):
print buffer
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.baidu.com/")
c.setopt(pycurl.WRITEFUNCTION, body)
c.perform()
主要原理:
C:
使用到的数据结构:
typedef void CURL; /*当初始化什么的时候只是一个void类型*/
struct SessionHandle {
struct Names dns;
struct Curl_multi *multi; /* 用于多线程处理*/
struct Curl_one_easy *multi_pos; /* if non-NULL, points to the its position
in multi controlling structure to assist
in removal. */
struct Curl_share *share; /* Share, handles global variable mutexing */
struct HandleData reqdata; /* Request-specific data */
struct UserDefined set; /* values set by the libcurl user ,用于setopt等*/
struct DynamicStatic change; /* possibly modified userdefined data */
struct CookieInfo *cookies; /* the cookies, read from files and servers */
struct Progress progress; /* for all the progress meter data */
struct UrlState state; /* struct for fields used for state info and
other dynamic purposes */
struct PureInfo info; /* stats, reports and info data */
#if defined(CURL_DOES_CONVERSIONS) && defined(HAVE_ICONV)
iconv_t outbound_cd; /* for translating to the network encoding */
iconv_t inbound_cd; /* for translating from the network encoding */
iconv_t utf8_cd; /* for translating to UTF8 */
#endif /* CURL_DOES_CONVERSIONS && HAVE_ICONV */
unsigned int magic; /* set to a CURLEASY_MAGIC_NUMBER */
};
struct UserDefined {
FILE *err; /* the stderr user data goes here */
void *debugdata; /* the data that will be passed to fdebug */
char *errorbuffer; /* (Static) store failure messages in here */
long proxyport; /* If non-zero, use this port number by default. If the
proxy string features a ":[port]" that one will override
this. */
/**一下省略10000行- -**/
};
使用的方法1:
1.初始化curl,得到sessionhandler结构体空间
CURL *curl_easy_init(void)
{
CURLcode res;
struct SessionHandle *data;
/* Make sure we inited the global SSL stuff */
if (!initialized) {
res = curl_global_init(CURL_GLOBAL_DEFAULT);
if(res) {
/* something in the global init failed, return nothing */
DEBUGF(fprintf(stderr, "Error: curl_global_init failed\n"));
return NULL;
}
}
/* We use curl_open() with undefined URL so far */
res = Curl_open(&data);
if(res != CURLE_OK) {
DEBUGF(fprintf(stderr, "Error: Curl_open failed\n"));
return NULL;
}
return data;
}
方法2.
设置参数:
CURLcode curl_easy_setopt(CURL *curl, CURLoption tag, ...)
{
va_list arg;
struct SessionHandle *data = curl;
CURLcode ret;
if(!curl)
return CURLE_BAD_FUNCTION_ARGUMENT;
va_start(arg, tag);
ret = Curl_setopt(data, tag, arg);
va_end(arg);
return ret;
}
CURLcode Curl_setopt(struct SessionHandle *data, CURLoption option,
va_list param)
{
char *argptr;
CURLcode result = CURLE_OK;
#ifndef CURL_DISABLE_HTTP
curl_off_t bigsize;
#endif
switch(option) {
case CURLOPT_DNS_CACHE_TIMEOUT:
data->set.dns_cache_timeout = va_arg(param, long);
break;
case CURLOPT_DNS_USE_GLOBAL_CACHE:
{
long use_cache = va_arg(param, long);
if (use_cache)
Curl_global_host_cache_init();
data->set.global_dns_cache = (bool)(0 != use_cache);
}
break;
case CURLOPT_SSL_CIPHER_LIST:
/* set a list of cipher we want to use in the SSL connection */
result = Curl_setstropt(&data->set.str[STRING_SSL_CIPHER_LIST],
va_arg(param, char *));
break;
case CURLOPT_RANDOM_FILE:
/*
* This is the path name to a file that contains random data to seed
* the random SSL stuff with. The file is only used for reading.
*/
result = Curl_setstropt(&data->set.str[STRING_SSL_RANDOM_FILE],
va_arg(param, char *));
break;
case CURLOPT_EGDSOCKET:
/*
* The Entropy Gathering Daemon socket pathname
*/
result = Curl_setstropt(&data->set.str[STRING_SSL_EGDSOCKET],
va_arg(param, char *));
break;
case CURLOPT_MAXCONNECTS:
/*
* Set the absolute number of maximum simultaneous alive connection that
* libcurl is allowed to have.
*/
result = Curl_ch_connc(data, data->state.connc, va_arg(param, long));
break;
case CURLOPT_FORBID_REUSE:
/*
* When this transfer is done, it must not be left to be reused by a
* subsequent transfer but shall be closed immediately.
*/
data->set.reuse_forbid = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FRESH_CONNECT:
/*
* This transfer shall not use a previously cached connection but
* should be made with a fresh new connect!
*/
data->set.reuse_fresh = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_VERBOSE:
/*
* Verbose means infof() calls that give a lot of information about
* the connection and transfer procedures as well as internal choices.
*/
data->set.verbose = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_HEADER:
/*
* Set to include the header in the general data output stream.
*/
data->set.include_header = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_NOPROGRESS:
/*
* Shut off the internal supported progress meter
*/
data->set.hide_progress = (bool)(0 != va_arg(param, long));
if(data->set.hide_progress)
data->progress.flags |= PGRS_HIDE;
else
data->progress.flags &= ~PGRS_HIDE;
break;
case CURLOPT_NOBODY:
/*
* Do not include the body part in the output data stream.
*/
data->set.opt_no_body = (bool)(0 != va_arg(param, long));
if(data->set.opt_no_body)
/* in HTTP lingo, this means using the HEAD request */
data->set.httpreq = HTTPREQ_HEAD;
break;
case CURLOPT_FAILONERROR:
/*
* Don't output the >=300 error code HTML-page, but instead only
* return error.
*/
data->set.http_fail_on_error = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_UPLOAD:
case CURLOPT_PUT:
/*
* We want to sent data to the remote host. If this is HTTP, that equals
* using the PUT request.
*/
data->set.upload = (bool)(0 != va_arg(param, long));
if(data->set.upload)
/* If this is HTTP, PUT is what's needed to "upload" */
data->set.httpreq = HTTPREQ_PUT;
break;
case CURLOPT_FILETIME:
/*
* Try to get the file time of the remote document. The time will
* later (possibly) become available using curl_easy_getinfo().
*/
data->set.get_filetime = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_CREATE_MISSING_DIRS:
/*
* An FTP option that modifies an upload to create missing directories on
* the server.
*/
data->set.ftp_create_missing_dirs = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_RESPONSE_TIMEOUT:
/*
* An FTP option that specifies how quickly an FTP response must be
* obtained before it is considered failure.
*/
data->set.ftp_response_timeout = va_arg( param , long ) * 1000;
break;
case CURLOPT_DIRLISTONLY:
/*
* An option that changes the command to one that asks for a list
* only, no file info details.
*/
data->set.ftp_list_only = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_APPEND:
/*
* We want to upload and append to an existing file.
*/
data->set.ftp_append = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_FTP_FILEMETHOD:
/*
* How do access files over FTP.
*/
data->set.ftp_filemethod = (curl_ftpfile)va_arg(param, long);
break;
case CURLOPT_NETRC:
/*
* Parse the $HOME/.netrc file
*/
data->set.use_netrc = (enum CURL_NETRC_OPTION)va_arg(param, long);
break;
case CURLOPT_NETRC_FILE:
/*
* Use this file instead of the $HOME/.netrc file
*/
result = Curl_setstropt(&data->set.str[STRING_NETRC_FILE],
va_arg(param, char *));
break;
case CURLOPT_TRANSFERTEXT:
/*
* This option was previously named 'FTPASCII'. Renamed to work with
* more protocols than merely FTP.
*
* Transfer using ASCII (instead of BINARY).
*/
data->set.prefer_ascii = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_TIMECONDITION:
/*
* Set HTTP time condition. This must be one of the defines in the
* curl/curl.h header file.
*/
data->set.timecondition = (curl_TimeCond)va_arg(param, long);
break;
case CURLOPT_TIMEVALUE:
/*
* This is the value to compare with the remote document with the
* method set with CURLOPT_TIMECONDITION
*/
data->set.timevalue = (time_t)va_arg(param, long);
break;
case CURLOPT_SSLVERSION:
/*
* Set explicit SSL version to try to connect with, as some SSL
* implementations are lame.
*/
data->set.ssl.version = va_arg(param, long);
break;
#ifndef CURL_DISABLE_HTTP
case CURLOPT_AUTOREFERER:
/*
* Switch on automatic referer that gets set if curl follows locations.
*/
data->set.http_auto_referer = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_ENCODING:
/*
* String to use at the value of Accept-Encoding header.
*
* If the encoding is set to "" we use an Accept-Encoding header that
* encompasses all the encodings we support.
* If the encoding is set to NULL we don't send an Accept-Encoding header
* and ignore an received Content-Encoding header.
*
*/
argptr = va_arg(param, char *);
result = Curl_setstropt(&data->set.str[STRING_ENCODING],
(argptr && !*argptr)?
(char *) ALL_CONTENT_ENCODINGS: argptr);
break;
case CURLOPT_FOLLOWLOCATION:
/*
* Follow Location: header hints on a HTTP-server.
*/
data->set.http_follow_location = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_UNRESTRICTED_AUTH:
/*
* Send authentication (user+password) when following locations, even when
* hostname changed.
*/
data->set.http_disable_hostname_check_before_authentication =
(bool)(0 != va_arg(param, long));
break;
case CURLOPT_MAXREDIRS:
/*
* The maximum amount of hops you allow curl to follow Location:
* headers. This should mostly be used to detect never-ending loops.
*/
data->set.maxredirs = va_arg(param, long);
break;
case CURLOPT_POST301:
/*
* Obey RFC 2616/10.3.2 and resubmit a POST as a POST after a 301.
*/
data->set.post301 = (bool)(0 != va_arg(param, long));
break;
case CURLOPT_POST:
/* Does this option serve a purpose anymore? Yes it does, when
CURLOPT_POSTFIELDS isn't used and the POST data is read off the
callback! */
if(va_arg(param, long)) {
data->set.httpreq = HTTPREQ_POST;
data->set.opt_no_body = FALSE; /* this is implied */
}
else
data->set.httpreq = HTTPREQ_GET;
break;
case CURLOPT_COPYPOSTFIELDS:
/*
* A string with POST data. Makes curl HTTP POST. Even if it is NULL.
* If needed, CURLOPT_POSTFIELDSIZE must have been set prior to
* CURLOPT_COPYPOSTFIELDS and not altered later.
*/
argptr = va_arg(param, char *);
if (!argptr || data->set.postfieldsize == -1)
result = Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], argptr);
else {
/*
* Check that requested length does not overflow the size_t type.
*/
if ((data->set.postfieldsize < 0) ||
((sizeof(curl_off_t) != sizeof(size_t)) &&
(data->set.postfieldsize > (curl_off_t)((size_t)-1))))
result = CURLE_OUT_OF_MEMORY;
else {
char * p;
(void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
/* Allocate even when size == 0. This satisfies the need of possible
later address compare to detect the COPYPOSTFIELDS mode, and
to mark that postfields is used rather than read function or
form data.
*/
p = malloc((size_t)(data->set.postfieldsize?data->set.postfieldsize:1));
if (!p)
result = CURLE_OUT_OF_MEMORY;
else {
if (data->set.postfieldsize)
memcpy(p, argptr, data->set.postfieldsize);
data->set.str[STRING_COPYPOSTFIELDS] = p;
}
}
}
data->set.postfields = data->set.str[STRING_COPYPOSTFIELDS];
data->set.httpreq = HTTPREQ_POST;
break;
case CURLOPT_POSTFIELDS:
/*
* Like above, but use static data instead of copying it.
*/
data->set.postfields = va_arg(param, void *);
/* Release old copied data. */
(void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
data->set.httpreq = HTTPREQ_POST;
break;
case CURLOPT_POSTFIELDSIZE:
/*
* The size of the POSTFIELD data to prevent libcurl to do strlen() to
* figure it out. Enables binary posts.
*/
bigsize = va_arg(param, long);
if (data->set.postfieldsize < bigsize &&
data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
/* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
(void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
data->set.postfields = NULL;
}
data->set.postfieldsize = bigsize;
break;
case CURLOPT_POSTFIELDSIZE_LARGE:
/*
* The size of the POSTFIELD data to prevent libcurl to do strlen() to
* figure it out. Enables binary posts.
*/
bigsize = va_arg(param, curl_off_t);
if (data->set.postfieldsize < bigsize &&
data->set.postfields == data->set.str[STRING_COPYPOSTFIELDS]) {
/* Previous CURLOPT_COPYPOSTFIELDS is no longer valid. */
(void) Curl_setstropt(&data->set.str[STRING_COPYPOSTFIELDS], NULL);
data->set.postfields = NULL;
}
data->set.postfieldsize = bigsize;
break;
case CURLOPT_HTTPPOST:
/*
* Set to make us do HTTP POST
*/
data->set.httppost = va_arg(param, struct curl_httppost *);
data->set.httpreq = HTTPREQ_POST_FORM;
data->set.opt_no_body = FALSE; /* this is implied */
break;
case CURLOPT_REFERER:
/*
* String to set in the HTTP Referer: field.
*/
if(data->change.referer_alloc) {
free(data->change.referer);
data->change.referer_alloc = FALSE;
}
result = Curl_setstropt(&data->set.str[STRING_SET_REFERER],
va_arg(param, char *));
data->change.referer = data->set.str[STRING_SET_REFERER];
break;
/**中间省略10000行case情况,但都是想data数据修正值*/
default:
/* unknown tag and its companion, just ignore: */
result = CURLE_FAILED_INIT; /* correct this */
break;
}
return result;
}
3.真正发送请求:
CURLcode curl_easy_perform(CURL *easy)
{
CURLM *multi;
CURLMcode mcode;
CURLcode code = CURLE_OK;
int still_running;
struct timeval timeout;
int rc;
CURLMsg *msg;
fd_set fdread;
fd_set fdwrite;
fd_set fdexcep;
int maxfd;
if(!easy)
return CURLE_BAD_FUNCTION_ARGUMENT;
multi = curl_multi_init();
if(!multi)
return CURLE_OUT_OF_MEMORY;
mcode = curl_multi_add_handle(multi, easy);
if(mcode) {
curl_multi_cleanup(multi);
if(mcode == CURLM_OUT_OF_MEMORY)
return CURLE_OUT_OF_MEMORY;
else
return CURLE_FAILED_INIT;
}
/* we start some action by calling perform right away */
do {
while(CURLM_CALL_MULTI_PERFORM ==
curl_multi_perform(multi, &still_running));
if(!still_running)
break;
FD_ZERO(&fdread);
FD_ZERO(&fdwrite);
FD_ZERO(&fdexcep);
/* timeout once per second */
timeout.tv_sec = 1;
timeout.tv_usec = 0;
/* Old deprecated style: get file descriptors from the transfers */
curl_multi_fdset(multi, &fdread, &fdwrite, &fdexcep, &maxfd);
rc = Curl_select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout);
/* The way is to extract the sockets and wait for them without using
select. This whole alternative version should probably rather use the
curl_multi_socket() approach. */
if(rc == -1)
/* select error */
break;
/* timeout or data to send/receive => loop! */
} while(still_running);
msg = curl_multi_info_read(multi, &rc);
if(msg)
code = msg->data.result;
mcode = curl_multi_remove_handle(multi, easy);
/* what to do if it fails? */
mcode = curl_multi_cleanup(multi);
/* what to do if it fails? */
return code;
}
4.从内存去除申请的空间:
void curl_easy_cleanup(CURL *curl)
{
struct SessionHandle *data = (struct SessionHandle *)curl;
if(!data)
return;
Curl_close(data);
}
php:
1.使用的数据结构:
typedef struct {
struct _php_curl_error err;
struct _php_curl_free *to_free;
struct _php_curl_send_headers header;
void ***thread_ctx;
CURL *cp; /* php主要申请这个结构体,但这个结构体包含了C的CURL这个类型的结构体,所以可以采用ch->cp来设置这个结构体内容*/
php_curl_handlers *handlers;
long id;
unsigned int uses;
zend_bool in_callback;
zval *clone;
} php_curl;
2. 使用的方法:
PHP_FUNCTION(curl_init)
{
php_curl *ch;
CURL *cp;
zval *clone;
char *url = NULL;
int url_len = 0;
char *cainfo;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|s", &url, &url_len) == FAILURE) {
return;
}
cp = curl_easy_init();
if (!cp) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Could not initialize a new cURL handle");
RETURN_FALSE;
}
alloc_curl_handle(&ch);
TSRMLS_SET_CTX(ch->thread_ctx);
ch->cp = cp;
ch->handlers->write->method = PHP_CURL_STDOUT;
ch->handlers->write->type = PHP_CURL_ASCII;
ch->handlers->read->method = PHP_CURL_DIRECT;
ch->handlers->write_header->method = PHP_CURL_IGNORE;
ch->uses = 0;
MAKE_STD_ZVAL(clone);
ch->clone = clone;
curl_easy_setopt(ch->cp, CURLOPT_NOPROGRESS, 1);
curl_easy_setopt(ch->cp, CURLOPT_VERBOSE, 0);
curl_easy_setopt(ch->cp, CURLOPT_ERRORBUFFER, ch->err.str);
curl_easy_setopt(ch->cp, CURLOPT_WRITEFUNCTION, curl_write);
curl_easy_setopt(ch->cp, CURLOPT_FILE, (void *) ch);
curl_easy_setopt(ch->cp, CURLOPT_READFUNCTION, curl_read);
curl_easy_setopt(ch->cp, CURLOPT_INFILE, (void *) ch);
curl_easy_setopt(ch->cp, CURLOPT_HEADERFUNCTION, curl_write_header);
curl_easy_setopt(ch->cp, CURLOPT_WRITEHEADER, (void *) ch);
curl_easy_setopt(ch->cp, CURLOPT_DNS_USE_GLOBAL_CACHE, 1);
curl_easy_setopt(ch->cp, CURLOPT_DNS_CACHE_TIMEOUT, 120);
curl_easy_setopt(ch->cp, CURLOPT_MAXREDIRS, 20); /* prevent infinite redirects */
cainfo = INI_STR("curl.cainfo");
if (cainfo && strlen(cainfo) > 0) {
curl_easy_setopt(ch->cp, CURLOPT_CAINFO, cainfo);
}
#if defined(ZTS)
curl_easy_setopt(ch->cp, CURLOPT_NOSIGNAL, 1);
#endif
if (url) {
if (!php_curl_option_url(ch, url, url_len)) {
_php_curl_close_ex(ch TSRMLS_CC);
RETURN_FALSE;
}
}
ZEND_REGISTER_RESOURCE(return_value, ch, le_curl);
ch->id = Z_LVAL_P(return_value);
}
执行真实下载
PHP_FUNCTION(curl_exec)
{
CURLcode error;
zval *zid;
php_curl *ch;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
return;
}
ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);
_php_curl_verify_handlers(ch, 1 TSRMLS_CC);
_php_curl_cleanup_handle(ch);
error = curl_easy_perform(ch->cp);
SAVE_CURL_ERROR(ch, error);
/* CURLE_PARTIAL_FILE is returned by HEAD requests */
if (error != CURLE_OK && error != CURLE_PARTIAL_FILE) {
if (ch->handlers->write->buf.len > 0) {
smart_str_free(&ch->handlers->write->buf);
}
RETURN_FALSE;
}
if (ch->handlers->std_err) {
php_stream *stream;
stream = (php_stream*)zend_fetch_resource(&ch->handlers->std_err TSRMLS_CC, -1, NULL, NULL, 2, php_file_le_stream(), php_file_le_pstream());
if (stream) {
php_stream_flush(stream);
}
}
if (ch->handlers->write->method == PHP_CURL_RETURN && ch->handlers->write->buf.len > 0) {
smart_str_0(&ch->handlers->write->buf);
RETURN_STRINGL(ch->handlers->write->buf.c, ch->handlers->write->buf.len, 1);
}
/* flush the file handle, so any remaining data is synched to disk */
if (ch->handlers->write->method == PHP_CURL_FILE && ch->handlers->write->fp) {
fflush(ch->handlers->write->fp);
}
if (ch->handlers->write_header->method == PHP_CURL_FILE && ch->handlers->write_header->fp) {
fflush(ch->handlers->write_header->fp);
}
if (ch->handlers->write->method == PHP_CURL_RETURN) {
RETURN_EMPTY_STRING();
} else {
RETURN_TRUE;
}
}
关闭程序,清空内存
PHP_FUNCTION(curl_close)
{
zval *zid;
php_curl *ch;
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "r", &zid) == FAILURE) {
return;
}
ZEND_FETCH_RESOURCE(ch, php_curl *, &zid, -1, le_curl_name, le_curl);
if (ch->in_callback) {
php_error_docref(NULL TSRMLS_CC, E_WARNING, "Attempt to close cURL handle from a callback");
return;
}
if (ch->uses) {
ch->uses--;
} else {
zend_list_delete(Z_LVAL_P(zid));
}
}
python的pycurl
1.使用的数据结构:
typedef struct {
PyObject_HEAD
PyObject *dict; /* Python attributes dictionary */
CURL *handle; /*引用C的curl的数据结构*/
PyThreadState *state;
CurlMultiObject *multi_stack;
CurlShareObject *share;
struct curl_httppost *httppost;
struct curl_slist *httpheader;
struct curl_slist *http200aliases;
struct curl_slist *quote;
struct curl_slist *postquote;
struct curl_slist *prequote;
/* callbacks */
PyObject *w_cb;
PyObject *h_cb;
PyObject *r_cb;
PyObject *pro_cb;
PyObject *debug_cb;
PyObject *ioctl_cb;
PyObject *opensocket_cb;
/* file objects */
PyObject *readdata_fp;
PyObject *writedata_fp;
PyObject *writeheader_fp;
/* misc */
void *options[OPTIONS_SIZE]; /* for OBJECTPOINT options */
char error[CURL_ERROR_SIZE+1];
} CurlObject;
方法:
1.初始化对象:
static CurlObject *
do_curl_new(PyObject *dummy)
{
CurlObject *self = NULL;
int res;
char *s = NULL;
UNUSED(dummy);
/* Allocate python curl object */
self = util_curl_new();
if (self == NULL)
return NULL;
/* Initialize curl handle */
self->handle = curl_easy_init();
if (self->handle == NULL)
goto error;
/* Set curl error buffer and zero it */
res = curl_easy_setopt(self->handle, CURLOPT_ERRORBUFFER, self->error);
if (res != CURLE_OK)
goto error;
memset(self->error, 0, sizeof(self->error));
/* Set backreference */
res = curl_easy_setopt(self->handle, CURLOPT_PRIVATE, (char *) self);
if (res != CURLE_OK)
goto error;
/* Enable NOPROGRESS by default, i.e. no progress output */
res = curl_easy_setopt(self->handle, CURLOPT_NOPROGRESS, (long)1);
if (res != CURLE_OK)
goto error;
/* Disable VERBOSE by default, i.e. no verbose output */
res = curl_easy_setopt(self->handle, CURLOPT_VERBOSE, (long)0);
if (res != CURLE_OK)
goto error;
/* Set FTP_ACCOUNT to NULL by default */
res = curl_easy_setopt(self->handle, CURLOPT_FTP_ACCOUNT, NULL);
if (res != CURLE_OK)
goto error;
/* Set default USERAGENT */
s = (char *) malloc(7 + strlen(LIBCURL_VERSION) + 1);
if (s == NULL)
goto error;
strcpy(s, "PycURL/"); strcpy(s+7, LIBCURL_VERSION);
res = curl_easy_setopt(self->handle, CURLOPT_USERAGENT, (char *) s); /*主要在这里调用c的curl的curl_easy_setopt方法,生成一个CURLsessionhandler结构体*/
if (res != CURLE_OK) {
free(s);
goto error;
}
self->options[ OPT_INDEX(CURLOPT_USERAGENT) ] = s; s = NULL;
/* Success - return new object */
return self;
error:
Py_DECREF(self); /* this also closes self->handle */
PyErr_SetString(ErrorObject, "initializing curl failed");
return NULL;
}
2.设置参数
do_curl_setopt(CurlObject *self, PyObject *args)
{
int option;
PyObject *obj;
int res;
if (!PyArg_ParseTuple(args, "iO:setopt", &option, &obj))
return NULL;
if (check_curl_state(self, 1 | 2, "setopt") != 0)
return NULL;
/* early checks of option value */
if (option <= 0)
goto error;
if (option >= (int)CURLOPTTYPE_OFF_T + OPTIONS_SIZE)
goto error;
if (option % 10000 >= OPTIONS_SIZE)
goto error;
#if 0 /* XXX - should we ??? */
/* Handle the case of None */
if (obj == Py_None) {
return util_curl_unsetopt(self, option);
}
#endif
/* Handle the case of string arguments */
if (PyString_Check(obj)) {
char *str = NULL;
Py_ssize_t len = -1;
char *buf;
int opt_index;
/* Check that the option specified a string as well as the input */
switch (option) {
case CURLOPT_CAINFO:
/*此处省略10000行,为pycurl未实现的curl的功能*/
case CURLOPT_CRLFILE:
case CURLOPT_ISSUERCERT:
/* FIXME: check if more of these options allow binary data */
str = PyString_AsString_NoNUL(obj);
if (str == NULL)
return NULL;
break;
case CURLOPT_POSTFIELDS:
if (PyString_AsStringAndSize(obj, &str, &len) != 0)
return NULL;
/* automatically set POSTFIELDSIZE */
if (len <= INT_MAX) {
res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE, (long)len); /*可以看到pycurl的设置参数也就是使用的c的curl的curl_easy_setopt,即是对C的curl的一种封装*/
} else {
res = curl_easy_setopt(self->handle, CURLOPT_POSTFIELDSIZE_LARGE, (curl_off_t)len);
}
if (res != CURLE_OK) {
CURLERROR_RETVAL();
}
break;
default:
PyErr_SetString(PyExc_TypeError, "strings are not supported for this option");
return NULL;
}
/* Allocate memory to hold the string */
assert(str != NULL);
if (len <= 0)
buf = strdup(str);
else {
buf = (char *) malloc(len);
if (buf) memcpy(buf, str, len);
}
if (buf == NULL)
return PyErr_NoMemory();
/* Call setopt */
res = curl_easy_setopt(self->handle, (CURLoption)option, buf);
/* Check for errors */
if (res != CURLE_OK) {
free(buf);
CURLERROR_RETVAL();
}
/* Save allocated option buffer */
opt_index = OPT_INDEX(option);
if (self->options[opt_index] != NULL) {
free(self->options[opt_index]);
self->options[opt_index] = NULL;
}
self->options[opt_index] = buf;
Py_INCREF(Py_None);
return Py_None;
}
3.关闭连接,或者说是删除内存中对象。
static PyObject *
do_curl_close(CurlObject *self)
{
if (check_curl_state(self, 2, "close") != 0) {
return NULL;
}
util_curl_close(self); /*删除了CurlObject对象*/
Py_INCREF(Py_None);
return Py_None;
}
由以上分析可以看出,php的curl和python的curl都是对curl的一种封装,如果想写出一个更符合自己需求的配置型爬虫,可以考虑直接用C写,不过C的爬虫是不适合快速开发,这由代码量决定。
当然更好的建议是使用webkit做爬虫,作为部分浏览器内核,毋庸置疑。以后再说.