zoukankan      html  css  js  c++  java
  • urllib库

    from urllib import request
    from urllib import parse

    rsp = request.urlopen('http://www.baidu.com/') 获取url的页面
    print(rsp.getcode()) 获取页面的返回值 200为成功
    print(rsp.read()) rsp是一个句柄 用read()显示出来 rsp.read()返回的是bytes类型

    request.urlretrieve('http://www.baidu.com/','baidu.html') 将页面下载下来 下载到‘baidu.html’中

    用浏览器发送请求时,如果包含中文或其他字符 浏览器会自动编码 用代码发送请求时,必须要手动编码
    url = 'http://www.baidu.com/s' 如果直接发送url='http://www.baidu.com/s?wd=刘德华' 会识别不了
    qs = {'wd':'刘德华'}
    qs = parse.urlencode(qs) 使用parse.urlencode(qs)来编码使请求可以被识别
    url = url+'?'+qs
    rsp = request.urlopen(url)
    print(rsp.read().decode())

    date = {'name':'周超', 'age':'18', 'greet':'hello world'}
    date = parse.urlencode(date)
    print(type(date))
    date = parse.parse_qs(date) 使用parse.parse_qs(date)来解码之前编码过的数据
    print(date)

    from urllib import parse

    url = 'http://www.baidu.com/s?wd=python'
    result = parse.urlparse(url)

    result = parse.urlsplit(url) parse.urlparse(url)和parse.urlsplit(url)功能基本相同 都是用来解析url各组成部分 区别在于parse.urlsplit(url)没有params选项

    print(result)
    print('scheme:',result.scheme)
    print('netloc:',result.netloc)
    print('path:',result.path)
    print('params:',result.params) 注意: 只有parse.urlparse(url)的结果有params属性 如果result来自parse.urlsplit(url) 这一行会报AttributeError
    print('query:',result.query)
    print('fragment:',result.fragment)

    要增加请求头 需要request.Request来实现
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400',
    'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
    date = {'first':'true',
    'pn':1,
    'kd':'python'}

    date = parse.urlencode(date).encode('utf-8') python3中默认的是unicode 这里需要bytes类型 unicode转bytes需要encode()
    rsp = request.Request(url,headers=headers,data=date,method='POST')
    res = request.urlopen(rsp)
    print(res.read().decode()) res.read()是bytes类型 需要解码成unicode 所以需要res.read().decode()

    7.设置代理
    from urllib import request
    url = 'http://httpbin.org/ip' http://httpbin.org/ip 可以查看ip地址等
    handler = request.ProxyHandler({'http':'183.129.244.17:45745'}) 通过 request.ProxyHandler({'http':'183.129.244.17:45745'}) 创建一个handler
    opener = request.build_opener(handler) 通过request.build_opener(handler) 创建opener
    res = opener.open(url)
    print(res.read())

    8.cookie
    from urllib import request
    url = 'https://study.163.com/my?from=study'
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400',
    'Referer':'https://chat.study.163.com/study-chat/index.htm?accid=s-1137692519&token=39b6d1cfc35c8e19c82ebdcee9e16fc3',
    'Cookie':'EDUWEBDEVICE=3451afbac1a440369a55941240853d9c; _f=1536210599096; _ntes_nnid=a7b4d27203bf9eb9f82ba5bf0b410392,1536210592051; _ntes_nuid=a7b4d27203bf9eb9f82ba5bf0b410392; 1137692519=1137692519; hasVolume=true; videoVolume=0.4; sideBarPost=651; hb_MA-BFF5-63705950A31C_u=%7B%22utm_source%22%3A%20%22cp-1025897964%22%2C%22utm_medium%22%3A%20%22share%22%2C%22utm_campaign%22%3A%20%22commission%22%2C%22utm_content%22%3A%20%22%22%2C%22utm_term%22%3A%20%22%22%2C%22promotional_id%22%3A%20%22%22%7D; hb_MA-BFF5-63705950A31C_source=www.baidu.com; __utmz=129633230.1536484261.12.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; UTM_CPS=1025897964|2; videoRate=1.5; NTESSTUDYSI=7cd66f6bf5784901af063378928d2990; __utmc=129633230; STUDY_UUID=9b6b2ce6-e746-40c3-8135-4e867730acd2; STUDY_SESS="Tai3FvilqwjtCbmXjAJcj2j/mTqoSZ/RcrB9loI9ZOFaoycTtiOAgkXM1tMEL8CKyt/Wtmb3LKGs26Kt/uDsVeWnuHUwcBws771wcLIKkoBwwsCzhJWJ/TEjwxcoD+74K50kS2lTpm9BIjRvbEFmBGxfxNLia6E4X+R3124aKEcAFhqsm7+DHVfJhiFOprz2"; STUDY_INFO=5600144693|2|1137692519|1536545905785; NETEASE_WDA_UID=1137692519#|#1521627640058; NTES_STUDY_YUNXIN_ACCID=s-1137692519; NTES_STUDY_YUNXIN_TOKEN=39b6d1cfc35c8e19c82ebdcee9e16fc3; videoResolutionType=2; __utma=129633230.1332926595.1536210590.1536546039.1536546039.16; utm="eyJjIjoiIiwiY3QiOiIiLCJpIjoiIiwibSI6IiIsInMiOiIiLCJ0IjoiIn0=|aHR0cHM6Ly9zdHVkeS4xNjMuY29tL215P2Zyb209c3R1ZHk="; __utmb=129633230.3.10.1536546039'}

    rsp = request.Request(url=url,headers=headers)
    req = request.urlopen(rsp)

    print(req.read().decode())

    with open('wangyiyun.html','w',encoding='utf-8') as f:
        f.write(req.read().decode()) 注意: 响应内容只能read()一次 前面print时已经读取过 这里再次read()会得到空内容 应先把req.read().decode()保存到变量中再打印和写入

  • 相关阅读:
    Django学习笔记
    禁用Win10自带截图工具快捷键(Shift+Win+S)
    Linux基础知识
    Ubuntu中配置Python虚拟环境Virtualenv
    PyCharm 格式化代码 常用快捷键
    你不得不知道的HashMap面试连环炮
    大型互联网公司分布式ID方案总结
    Java程序员必会常用Linux速查手册
    面试题:InnoDB中一棵B+树能存多少行数据?
    C语言:标准IO_fopen( )、fclose() ①
  • 原文地址:https://www.cnblogs.com/zhouchao123/p/9622892.html
Copyright © 2011-2022 走看看