zoukankan      html  css  js  c++  java
  • python爬虫基础

    Demo1:urllib使用

    #encoding:utf-8
    import urllib
    import urlparse
    def printlist(lines):
        """Print each element of ``lines`` on its own line."""
        for entry in lines:
            print(entry)
    def demo():
        """Open the blog front page and list the attributes of its header object.

        The commented-out calls are alternative ways of inspecting the same
        response; enable one at a time to try them.
        """
        s = urllib.urlopen('http://blog.kamidox.com')
        try:
            msg = s.info()
            # printlist(msg.items())
            # printlist(msg.headers)
            # print(s.getcode())
            # printlist(s.readlines())
            # print(msg.getheader("Content-Type"))
            printlist(dir(msg))  # find the methods of the class
        finally:
            # Release the connection even if inspecting the response fails.
            s.close()
    def progress(blk, blk_size, total_size):
        """Print download progress; intended as a ``urlretrieve`` report hook.

        Args:
            blk: Number of blocks transferred so far.
            blk_size: Size of one block in bytes.
            total_size: Total size in bytes; zero or negative means the size
                is not known.
        """
        done = blk * blk_size
        if total_size <= 0:
            # No usable total: a percentage would divide by zero or be negative.
            print("%d bytes (total size unknown)" % done)
            return
        # The last block is usually partial, so blk * blk_size can overshoot.
        done = min(done, total_size)
        print("%d/%d - %.02f%%" % (done, total_size, done * 100.0 / total_size))
    def retrieve():
        """Download the blog front page to ``index.html``, reporting progress."""
        source = 'http://blog.kamidox.com'
        target = 'index.html'
        saved_path, headers = urllib.urlretrieve(source, target, reporthook=progress)
        # print(saved_path)
        # printlist(headers.items())
    def urlencode():
        """Encode a sample dict as a query string, then parse it back and print both."""
        fields = {'score': 100, 'name': 'pachongjichu', 'comment': 'very good'}
        query = urllib.urlencode(fields)
        print(query)
        decoded = urlparse.parse_qs(query)
        print(decoded)
    # Script entry point: only the query-string demo is run by default.
    if __name__ == '__main__':
        urlencode()
    

    Demo2:抓取图片

    #encoding:utf-8
    import urllib
    # Download a placeholder cat picture and save it to disk.
    response = urllib.urlopen("http://placekitten.com/g/300/400")
    try:
        cat_img = response.read()
        headers = response.info()
    finally:
        # Always release the connection, even if the download fails midway.
        response.close()
    with open('cat_300_400.jpg', 'wb') as f:  # an image is binary data, so write in binary mode
        f.write(cat_img)
    print(headers)
    # NOTE: the body has already been consumed into cat_img; a second
    # response.read() would not return it again.
    

    Demo3:有道词典翻译

    # encoding:utf-8
    import urllib
    import json
    import sys
    # Python 2 workaround: switch the default encoding to UTF-8 to avoid
    # "UnicodeDecodeError: 'ascii' codec can't decode byte 0xe7 in position 0".
    reload(sys)
    sys.setdefaultencoding('utf8')
    content = raw_input('请输入要翻译的内容:')  # raw_input returns the typed text as a raw string
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.youdao.com/"
    # Form fields, taken from the "Form Data" section of the captured request.
    # They are inserted one by one, in this order, into an empty dict.
    data = {}
    for key, value in (
        ('i', content),
        ('from', 'AUTO'),
        ('to', 'AUTO'),
        ('smartresult', 'dict'),
        ('client', 'fanyideskweb'),
        ('salt', '1497500950438'),
        ('sign', 'cd8af15baafdac91e90445ce25f3cea1'),
        ('doctype', 'json'),
        ('version', '2.1'),
        ('keyfrom', 'fanyi.web'),
        ('action', 'FY_BY_CLICKBUTTON'),
        ('typoResult', 'true'),
    ):
        data[key] = value
    data = urllib.urlencode(data).encode('utf-8')
    # Supplying a data argument makes urlopen send a POST request.
    response = urllib.urlopen(url, data)
    html = response.read().decode('utf-8')
    # print(html)  # at this point html is a JSON document
    target = json.loads(html)  # json.loads turns it into a dict
    translation = target['translateResult'][0][0]['tgt']
    print("翻译结果:%s" % (translation))

    Demo4:代码隐藏和延迟请求

     

    # encoding:utf-8
    import json
    import sys
    import time
    import urllib
    import urllib2
    # Python 2 workaround: switch the default encoding to UTF-8 to avoid
    # "UnicodeDecodeError: 'ascii' codec can't decode byte 0xe7 in position 0".
    reload(sys)
    sys.setdefaultencoding('utf8')
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=http://www.youdao.com/"
    # Headers are prepared before the request object is created, so the
    # request carries a browser-like User-Agent.
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'
    while True:
        content = raw_input('请输入要翻译的内容(输入”q!“退出程序):')  # raw_input returns the typed text as a raw string
        if content == 'q!':
            break
        data = {}
        data['i'] = content
        data['from'] = 'AUTO'
        data['to'] = 'AUTO'
        data['smartresult'] = 'dict'
        data['client'] = 'fanyideskweb'
        data['salt'] = '1497500950438'
        data['sign'] = 'cd8af15baafdac91e90445ce25f3cea1'
        data['doctype'] = 'json'
        data['version'] = '2.1'
        data['keyfrom'] = 'fanyi.web'
        data['action'] = 'FY_BY_CLICKBUTTON'
        data['typoResult'] = 'true'
        data = urllib.urlencode(data).encode('utf-8')
        # urllib.urlopen's third positional argument is ``proxies``, not
        # headers, so passing ``head`` there never sent the User-Agent.
        # urllib2.Request attaches the headers to the request itself.
        request = urllib2.Request(url, data, head)
        response = urllib2.urlopen(request)
        try:
            html = response.read().decode('utf-8')
        finally:
            response.close()
        # print(html)  # at this point html is a JSON document
        target = json.loads(html)  # json.loads turns it into a dict
        print("翻译结果:%s" % (target['translateResult'][0][0]['tgt']))
        time.sleep(3)  # first approach: delay between requests

     也可以在创建 Request 对象后再调用 add_header('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36')

    不过python2.7的 urllib 模块没有 Request 类,所以直接用 urllib 不可以,需要改用 urllib2.Request

    但是如果你连续不断地下载,就不像一个正常人的访问行为了:服务器会统计每个ip在单位时间内的请求次数,超过某个数值就可能被拒绝访问

    第一个方案,是延迟请求

    第二个方案,是代理

    1.参数是一个字典{‘类型’:‘代理ip:端口号’}(python2 中下面这些接口都在 urllib2 模块里)

    proxy_support=urllib2.ProxyHandler({})

    2.定制、创建一个opener

    opener=urllib2.build_opener(proxy_support)

    3.安装 opener(安装之后要用 urllib2.urlopen 发起请求才会生效)

    urllib2.install_opener(opener)

    或者opener.open(url)

    # coding: utf-8
    import urllib
    import urllib2
    import random
    import sys
    # Python 2 workaround: make UTF-8 the default string encoding.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    url = 'http://www.ip138.com/'
    iplist = ['183.203.208.166:8118', '111.1.32.28:81', '119.6.144.73:81']
    # Pick one proxy at random for this run.
    proxy_support = urllib2.ProxyHandler({'http': random.choice(iplist)})

    opener = urllib2.build_opener(proxy_support)
    # The HTTP header name is "User-Agent" (hyphen); "User_Agent" is a
    # different, unrecognised header.
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36')]
    urllib2.install_opener(opener)
    # install_opener only affects urllib2.urlopen; urllib.urlopen would bypass
    # both the proxy and the custom headers.
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    print(html)
    

      

  • 相关阅读:
    css---box-sizing
    float与inline-block的一些应用场景的区别
    一些html元素的最原始状态
    css之深入理解overflow
    css中的锚点
    新增UI样式
    zh-CN、zh-Hans区别
    SourceTree 3.3.6安装跳过注册安装
    Windows sever 由于管理员设置的策略,该磁盘处于脱机状态的解决方法。
    CentOS7.x安装VNC实录
  • 原文地址:https://www.cnblogs.com/BetterThanEver_Victor/p/7016970.html
Copyright © 2011-2022 走看看