zoukankan      html  css  js  c++  java
  • python爬虫

    python如何访问互联网 url+lib==urllib

      url protocol://hostname[:port]/path/[;parameters][?query][#fragment]

      protocol:http https ftp file ed2k 服务器域名/ip  资源地址

    urllib(包 四个模块 urllib.request urllib.error urllib.parse urllib.robotparser)

      urllib.request.urlopen(url,data,timeout,capath...)

    import urllib.request

    # Fetch the Baidu homepage and print its decoded HTML.
    # bug fix vs. original: the module was misspelled `rullib`, which would
    # raise NameError at runtime.
    response = urllib.request.urlopen('http://www.baidu.com')
    html = response.read()        # raw bytes (二进制)
    html = html.decode("utf-8")   # decode bytes -> str
    print(html)
    
    # Download a kitten image from placekitten.com and save it to disk.
    import urllib.request

    # bug fixes vs. original: urlopen needs a full URL including the scheme
    # (a bare hostname raises ValueError); the variable was misspelled
    # `reponse` but read as `response` below; the open() mode string used a
    # smart quote (’wb'), a syntax error.
    response = urllib.request.urlopen('http://placekitten.com/g/500/600')
    # Alternative: build a Request object first, then open it:
    #req = urllib.request.Request('http://placekitten.com/g/500/600')
    #response = urllib.request.urlopen(req)

    response.geturl()    # final URL after any redirects
    response.info()      # response headers
    response.getcode()   # HTTP status code

    cat_img = response.read()
    with open('cat_500_600.jpg', 'wb') as f:
        f.write(cat_img)
    

      

    #有道翻译 — Youdao translation demo: POST form data while hiding as a browser.
    #DevTools notes: Network tab shows Method (GET/POST) and Preview.
    #Headers:
    #    Remote Address / Request URL / Request Method
    #    Request Headers (服务器检查 User-Agent 判断是否非人类访问)
    #    Form Data
    #隐藏: set a browser-like User-Agent header (see below)
    #ip访问频率限制对策: 1) 减少频率 (time.sleep)  2) 使用代理 (see proxy notes below)
    import urllib.request
    import urllib.parse
    import json
    import time

    while True:
        content = input("请输入需要翻译的内容(输入'q!'退出程序)")
        if content == 'q!':
            break
        # NOTE(review): placeholder — paste the real translate endpoint here.
        url = "http://fanyi.youdao.com..."
        data = {}
        data['type'] = 'AUTO'
        data['i'] = content          # bug fix: the text to translate goes in 'i'
        data['doctype'] = 'json'
        data['xmlVersion'] = '1.6'
        data['keyfrom'] = 'fanyi.web'
        data['ue'] = 'UTF-8'
        data['typoResult'] = 'true'
        # urlencode the form, then encode to bytes so urlopen sends a POST.
        data = urllib.parse.urlencode(data).encode('utf-8')

        # Hide behind a browser User-Agent: either pass a headers dict when
        # constructing the Request, or call add_header() afterwards.
        req = urllib.request.Request(url, data)
        req.add_header('User-Agent', 'Mozilla/5.0')

        # 代理 (proxy) recipe:
        # 1. proxy_support = urllib.request.ProxyHandler({'类型': '代理ip:端口号'})
        # 2. opener = urllib.request.build_opener(proxy_support)
        # 3a. install globally: urllib.request.install_opener(opener)
        # 3b. or call directly: opener.open(url)

        # bug fix: a Request object has no read(); open it first.
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
        target = json.loads(html)    # JSON (轻量级数据交换格式) -> dict
        print("翻译结果:" + target['translateResult'][0][0]['tgt'])
        time.sleep(5)                # throttle requests to avoid IP bans
    #测试代理 — route a request through a random proxy and read back our IP.
    import urllib.request
    import random

    url = 'http://www.whatismyip.com.tw'
    # NOTE(review): fill in real '代理ip:端口号' strings before running;
    # empty entries will not produce a usable proxy.
    ip_list = ['','','']

    # bug fixes vs. original: the list is named ip_list (not iplist);
    # addheaders holds (name, value) tuples joined by a comma — the
    # original's colon was a syntax error; install_opener must receive the
    # opener we built, not the builtin open().
    proxy_support = urllib.request.ProxyHandler({'http': random.choice(ip_list)})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', 'Mozzilla')]

    urllib.request.install_opener(opener)
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')
    

      

  • 相关阅读:
    数字图像-概述
    Python-python打包编译成pyd或者.so,保护代码。
    计算机组成原理-概述
    9大开源云管理平台(CMP)
    计算机组成原理-CPU-CPU知识科普:秒懂主频、核心、线程、缓存、架构详解
    svn git协同管理
    DEVTMPFS
    关于flash擦除的方法
    SQLServer强制保存
    360极速浏览器无法正确getHours
  • 原文地址:https://www.cnblogs.com/echoshao/p/6535488.html
Copyright © 2011-2022 走看看