  • Teaching Myself Python, Part 11: A Summary of Python Web Scraping

      After a few days of study and experimentation I have gradually picked up some small insights into Python web scraping. Crawlers turn out to share a lot of common ground: fetch a batch of links, read the page source, extract the content you need, and then repeat. Once you are reasonably fluent with this routine, it is worth summarizing the common parts and writing a helper class to avoid repetitive work.

      Reference: "Some Tips for Scraping Websites with a Python Crawler" (用python爬虫抓站的一些技巧总结, repost)

      1. Accessing a website  # the simplest way to fetch a page's HTML

    import urllib2
    response = urllib2.urlopen("http://www.xx.com")
    print response.read()

      2. Masquerading as a browser (User-Agent, Referer, etc.)  # to avoid being blocked by the server, it is better to pretend to be a browser

    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
        'Referer': 'http://www.xx.com/xx',
        'Accept': 'application/javascript, */*;q=0.8'
    }
    req = urllib2.Request(url="http://www.xx.com", data=None, headers=headers)
    response = urllib2.urlopen(req)

      3. Encoding POST data

    import urllib, urllib2
    values = {
        'username': 'xxx',
        'password': 'xxx',
        'key': 'xxx'
    }
    postdata = urllib.urlencode(values)   # url-encode the form fields
    req = urllib2.Request(url, data=postdata)
    response = urllib2.urlopen(req)

      4. Cookies

    import urllib2, cookielib
    # keep cookies set by the server across requests
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    opener = urllib2.build_opener(cookie_handler)
    urllib2.install_opener(opener)
    response = urllib2.urlopen(url)

      5. Proxy servers  # hitting the same URL over and over can get your IP banned or your request rate limited

    import urllib2
    proxy_handler = urllib2.ProxyHandler({"http" : '42.121.6.80:8080'})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)
    response = urllib2.urlopen(url)

      Q: What if I want to use cookies and a proxy at the same time?

      A: urllib2.build_opener accepts multiple handler arguments, e.g. BaseHandler, ProxyHandler, HTTPHandler, FileHandler, FTPHandler, CacheFTPHandler, and so on.
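
      For instance, a minimal sketch that installs both handlers at once (the proxy address is the same placeholder used above):

    import urllib2, cookielib
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})   # placeholder proxy
    opener = urllib2.build_opener(cookie_handler, proxy_handler)         # pass several handlers at once
    urllib2.install_opener(opener)
    response = urllib2.urlopen("http://www.xx.com")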

      6. gzip  # most sites support gzip compression these days; requesting compressed pages greatly improves crawl efficiency and reduces bandwidth usage

    import urllib2, zlib
    req = urllib2.Request(url)
    req.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(req, timeout=120)
    html = response.read()
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(html, 16+zlib.MAX_WBITS)   # 16+MAX_WBITS tells zlib to expect a gzip wrapper

      7. Miscellaneous

      Setting the thread stack size: the stack size has a significant effect on Python's memory footprint; set it like this:

    from threading import stack_size
    stack_size(32768*16)

      Setting a timeout

    import socket
    socket.setdefaulttimeout(10)  # connections time out after 10 seconds

      Retrying after failure

    def get(self, req, retries=3):
        # self.opener is an opener built earlier with urllib2.build_opener(...)
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception, what:
            print what, req
            if retries > 0:
                return self.get(req, retries-1)
            else:
                print 'GET Failed', req
                return ''
        return data
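
      As a standalone variant (my own sketch, not from the referenced post; get_with_retries is a hypothetical name), the same retry idea works directly with urllib2.urlopen:

    import urllib2

    def get_with_retries(url, retries=3):
        # hypothetical helper: retry a failed request a few times before giving up
        try:
            return urllib2.urlopen(url, timeout=120).read()
        except Exception, what:
            print what, url
            if retries > 0:
                return get_with_retries(url, retries-1)
            print 'GET Failed', url
            return ''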

      Based on all of the above, we can write our own easily configurable helper class to take care of the repetitive work:

    # -*- coding: utf-8 -*-
    import cookielib, urllib, urllib2, socket
    import zlib, StringIO
    class HttpClient:
      __cookie = cookielib.CookieJar()
      # Proxy settings; enable when needed (could later rotate among several proxies)
      #__proxy_handler = urllib2.ProxyHandler({"http" : '42.121.6.80:8080'})
      __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie))#,__proxy_handler)
      __req.addheaders = [
        ('Accept', 'application/javascript, */*;q=0.8'),
        ('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')
      ]
      urllib2.install_opener(__req)

      def Get(self, url, refer=None, retries=3):
        try:
          req = urllib2.Request(url)
          req.add_header('Accept-encoding', 'gzip')
          if not (refer is None):
            req.add_header('Referer', refer)
          response = urllib2.urlopen(req, timeout=120)
          html = response.read()
          gzipped = response.headers.get('Content-Encoding')
          if gzipped:
              html = zlib.decompress(html, 16+zlib.MAX_WBITS)
          return html
        except Exception, what:
            print what
            if retries > 0:
                return self.Get(url, refer, retries-1)
            else:
                print "Get Failed", url
                return ''
        #except urllib2.HTTPError, e:
        #  return e.read()
        #except socket.timeout, e:
        #  return ''
        #except socket.error, e:
        #  return ''

      def Post(self, url, data, refer=None):
        try:
          req = urllib2.Request(url, urllib.urlencode(data))
          #req = urllib2.Request(url, data)
          if not (refer is None):
            req.add_header('Referer', refer)
          return urllib2.urlopen(req, timeout=120).read()
        except urllib2.HTTPError, e:
          return e.read()
        except socket.timeout, e:
          return ''
        except socket.error, e:
          return ''

      def Download(self, url, file):
        output = open(file, 'wb')
        output.write(urllib2.urlopen(url).read())
        output.close()

      def getCookie(self, key):
        for c in self.__cookie:
          if c.name == key:
            return c.value
        return ''

      def setCookie(self, key, val, domain):
        ck = cookielib.Cookie(version=0, name=key, value=val, port=None, port_specified=False, domain=domain, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        self.__cookie.set_cookie(ck)
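
      A quick usage sketch of the class above (the URLs, form fields, and cookie name are placeholders of my own, not from the original demo):

    client = HttpClient()
    html = client.Get("http://www.xx.com/list", refer="http://www.xx.com")   # GET with gzip and retries
    result = client.Post("http://www.xx.com/login", {'username': 'xxx', 'password': 'xxx'})
    client.Download("http://www.xx.com/logo.png", "logo.png")                # save a file to disk
    print client.getCookie('session_id')                                     # 'session_id' is just an example name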

      As for multithreading, this snippet I found online will do; it handles concurrent workers too...

    from threading import Thread
    from Queue import Queue
    from time import sleep
    # q is the task queue
    # NUM is the number of concurrent worker threads
    # JOBS is how many tasks there are
    q = Queue()
    NUM = 2
    JOBS = 10
    # the handler that processes a single task
    def do_somthing_using(arguments):
        print arguments
    # the worker: keeps pulling items off the queue and processing them
    def working():
        while True:
            arguments = q.get()
            do_somthing_using(arguments)
            sleep(1)
            q.task_done()
    # start NUM worker threads waiting on the queue
    for i in range(NUM):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    # enqueue the JOBS tasks
    for i in range(JOBS):
        q.put(i)
    # wait for all jobs to finish
    q.join()
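
      Putting the two together, the worker can just as well fetch pages with the HttpClient above; this is only a rough sketch of mine, with placeholder URLs and worker/variable names:

    from threading import Thread
    from Queue import Queue

    q = Queue()
    client = HttpClient()                       # helper class defined above

    def crawl_worker():
        while True:
            url = q.get()
            html = client.Get(url)              # gzip and retries handled by the helper
            print url, len(html)
            q.task_done()

    for i in range(2):                          # two concurrent workers
        t = Thread(target=crawl_worker)
        t.setDaemon(True)
        t.start()

    for n in range(1, 11):
        q.put('http://www.xx.com/page/%d' % n)  # placeholder URLs
    q.join()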

      That wraps up crawlers for now. I'll put the more advanced crawler frameworks and HTML parsing libraries aside for the moment while I decide what comes next: pygame or django!

      Repository for the crawler demo (I've just started playing with git): http://git.oschina.net/tabei/Python_spider

  • Original post: https://www.cnblogs.com/jixin/p/5145813.html