zoukankan      html  css  js  c++  java
  • PycURL 绝版聊神 ITeye技术网站 教主张

    PycURL - 绝版聊神 - ITeye技术网站

    http://wiki.woodpecker.org.cn/moin/zspy

    代码见 http://zspy.googlecode.com



    张沈鹏 zsp007@gmail.com http://zsp.iteye.com/



    2008-1-23 16:42





    1. PycURL

    Pycurl http://pycurl.sourceforge.net/



    PycURL 是 libcurl 的 Python 接口,用 C 编写,比 urllib 快,功能也更强。它支持限制最大重定向次数,可以避免陷入循环重定向的陷阱。适合用来写网络爬虫、抓取网页。



    从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.



    参考文献1,测试代码





    Toggle line numbers

       1

       2 #像操作文件一样操作字符串,也可以from cStringIO import StringIO,性能应该会好一些

       3 import StringIO

       4

       5 html = StringIO.StringIO()

       6

       7 import pycurl

       8 c = pycurl.Curl()

       9

      10 c.setopt(pycurl.URL, 'http://www.baidu.com')

      11

      12 #写的回调

      13 c.setopt(pycurl.WRITEFUNCTION, html.write)

      14

      15 c.setopt(pycurl.FOLLOWLOCATION, 1)

      16

      17 #最大重定向次数,可以预防重定向陷阱

      18 c.setopt(pycurl.MAXREDIRS, 5)

      19

      20 #访问,阻塞到访问结束

      21 c.perform()

      22

      23 #打印出 200(HTTP状态码) http://www.baidu.com(生效的url)

      24 print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)

      25

      26 #输出百度首页的html

      27 #print html.getvalue()

    然后看看多线程(并发下载)。http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 里有很多例子,也可以参考 CurlMulti 的文档 http://pycurl.sourceforge.net/doc/curlmultiobject.html



    我自己改写了一个:)





    Toggle line numbers

       1

       2 #!/usr/bin/env python

       3 #coding=utf-8

       4

       5 import threading

       6 import pycurl

       7 from cStringIO import StringIO

       8

       9 class UrlOpen(threading.Thread):

      10     """异步下载网页"""

      11

      12     def __init__(self):

      13         super(UrlOpen,self).__init__()

      14         self.opener = pycurl.CurlMulti()

      15         self.handle_list=[]

      16

      17     def add(self,url,recall,writer=StringIO()):

      18         """

      19         参数:网址,回调函数,存放临时数据的对象

      20         """

      21         c = pycurl.Curl()

      22

      23         #可以传给回调函数

      24         c.url=url

      25         c.content = writer

      26         c.recall = recall

      27         c.setopt(c.URL,url)

      28         c.setopt(c.WRITEFUNCTION,c.content.write)

      29

      30         self.handle_list.append(c)

      31         self.opener.add_handle(c)

      32

      33     def _remove(self,c):

      34         c.close()

      35         self.opener.remove_handle(c)

      36         self.handle_list.remove(c)

      37

      38

      39     def run(self):

      40         num_handle=len(self.handle_list)

      41         while 1:

      42             ret = self.opener.select(10.0)

      43             if ret == -1:  continue

      44             while 1:

      45                 num_handle_pre=num_handle

      46                 ret, num_handle =self.opener.perform()

      47                 #活动的连接数改变时

      48                 if num_handle!=num_handle_pre:

      49                     result=self.opener.info_read()

      50                     print result

      51                     for i in result[1]:

      52                         #成功

      53                         i.http_code = i.getinfo(i.HTTP_CODE)

      54                         self._remove(i)

      55                         i.recall(i)

      56                     for i in result[2]:

      57                         #失败,应该记录一下

      58                         self._remove(i)

      59

      60                 if ret != pycurl.E_CALL_MULTI_PERFORM:

      61                     break

      62

      63 _opener=None

      64 def urlopen(*arg,**key):

      65     global _opener

      66     if _opener is None:

      67         _opener=UrlOpen()

      68         _opener.add(*arg,**key)

      69         _opener.start()

      70     else:

      71         _opener.add(*arg,**key)

      72

      73 def show(x):

      74     print x.content.getvalue()

      75 if __name__=="__main__":

      76     urlopen("http://www.baidu.com/",show)

      77     _opener.join()

    又封装了一个异步打开网页的类和函数





    Toggle line numbers

       1 #coding=utf-8

       2

       3 import threading

       4 from cStringIO import StringIO

       5

       6 import pycurl

       7 """

       8 Asyn open url

       9 Author:zsp007@gmail.com

      10 2008-1-25 17:14

      11 """

      12

      13 class UrlOpen(threading.Thread):

      14     """异步下载网页"""

      15

      16     def __init__(self,):

      17         super(UrlOpen,self).__init__()

      18         self.opener = pycurl.CurlMulti()

      19         self.handle_list=[]

      20         self.waiting=[]

      21

      22     def add(self,url,recall,catch=None,writer=StringIO()):

      23         """

      24         参数:网址,回调函数,存放临时数据的对象

      25         """

      26         if catch is None:

      27             def catch(curl,error_no,desp):

      28                 #print "Error:%s - %s"%(error_no,desp)

      29                 pass

      30

      31         c = pycurl.Curl()

      32

      33         #可以传给回调函数

      34         c.url=url

      35         c.content = writer

      36         c.recall = recall

      37         c.catch=catch

      38         c.setopt(c.URL,

      39             url.encode('utf-8') if type(url) is unicode else url

      40         )

      41         c.setopt(c.WRITEFUNCTION,c.content.write)

      42

      43         self.waiting.append(c)

      44

      45     def _add(self):

      46         waiting=self.waiting[:]

      47         self.waiting=[]

      48         for c in waiting:

      49             self.handle_list.append(c)

      50             self.opener.add_handle(c)

      51

      52     def _remove(self,c):

      53         c.close()

      54         self.opener.remove_handle(c)

      55         self.handle_list.remove(c)

      56

      57

      58     def run(self):

      59         import select

      60         import time

      61         num_handle=0

      62         while 1:

      63             if self.handle_list:

      64                 ret = self.opener.select(1.0)

      65                 if ret >= 0:

      66                     while 1:

      67                         num_handle_pre=num_handle

      68                         ret, num_handle =self.opener.perform()

      69                         #活动的连接数改变时

      70                         if num_handle!=num_handle_pre:

      71                             result=self.opener.info_read()

      72                             for i in result[1]:

      73                                 #成功

      74                                 i.http_code = i.getinfo(i.HTTP_CODE)

      75                                 self._remove(i)

      76                                 i.recall(i)

      77                             for i in result[2]:

      78                                 #失败,应该记录一下,或回调失败函数

      79                                 #i为(<pycurl.Curl object at 0x00C04C80>, 6, 'Could not resolve host: www.msn.com (Domain name not found)')

      80                                 i[0].catch(*i)

      81                                 self._remove(i[0])

      82                         if ret != pycurl.E_CALL_MULTI_PERFORM:

      83                             break

      84             else:

      85                 time.sleep(1)

      86             self._add()

      87

      88 _opener=None

      89 def urlopen(*arg,**key):

      90     global _opener

      91     if _opener is None:

      92         _opener=UrlOpen()

      93         _opener.start()

      94     _opener.add(*arg,**key)

      95

      96 if __name__=="__main__":

      97     def show(x):

      98         print x.content.getvalue()

      99         print '--'*11

    100     urlopen("http://www.baidu.com/",show)

    101     urlopen("http://www.google.com/",show)

    102     urlopen("http://www.sougou.com/",show)

    103     urlopen("http://www.yodao.com/",show)

    104     urlopen("http://www.yahoo.com/",show)

    105     urlopen("http://www.msn.com/",show)

    106     _opener.join()



    1.1. 相关文献

    PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx



    python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337

  • 相关阅读:
    Python pandas检查数据中是否有NaN的几种方法
    实现one hot encode独热编码的两种方法
    深度学习框架-caffe安装-环境[Mac OSX 10.12]
    Git使用记录
    求解大于某数的下一个素数
    caffe cifar10试跑问题总结
    深度学习框架-caffe安装-Mac OSX 10.12
    常用bash命令
    vim文本编辑器
    第十八周助教总结
  • 原文地址:https://www.cnblogs.com/lexus/p/2437434.html
Copyright © 2011-2022 走看看