zoukankan      html  css  js  c++  java
  • python爬虫--基础入门1

    一个最简单的爬虫

    from urllib.request import urlopen

    # The simplest possible crawler: fetch one page and inspect the response.
    target = 'http://www.baidu.com'

    # Fire the GET request.
    resp = urlopen(target)

    # The body comes back as raw bytes.
    page = resp.read()
    # Uncomment to print the decoded page content.
    #print(page.decode())

    divider = '*' * 50
    # HTTP status code of the response.
    print(resp.getcode())
    print(divider)
    # The URL actually retrieved (useful to detect redirects).
    print(resp.geturl())
    print(divider)
    # The response headers.
    print(resp.info())
    

    request的使用

    from urllib.request import urlopen
    from urllib.request import Request
    from fake_useragent import UserAgent

    # Build a Request object so we can attach a browser-like User-Agent header.
    url = 'https://www.baidu.com'
    headers = {
        "User-Agent": UserAgent().chrome
    }

    request = Request(url, headers=headers)
    # Show the header urllib actually stored (it normalizes the capitalization).
    print(request.get_header('User-agent'))

    # urlopen accepts a Request object just like a plain URL string.
    body = urlopen(request).read()

    print(body.decode())
    

    get请求编码转换

    get请求时,往往需要在url上添加参数,但是如果参数值是中文的话会出现请求报错的情况,我们就需要先将中文转换编码才行

    from urllib.request import urlopen
    from urllib.request import Request
    from fake_useragent import UserAgent
    from urllib import parse

    # GET with non-ASCII query parameters: the values must be percent-encoded,
    # otherwise the request fails for servers that reject raw Chinese text.

    # Single parameter: quote just the value.
    #url = 'https://www.baidu.com/s?wd={}'.format(parse.quote('火狐'))

    # Multiple parameters: urlencode builds the whole query string and
    # percent-encodes every key and value in one call.
    args = {
        'wd':'火狐',
        'id':'utf-8'  # NOTE(review): Baidu's encoding parameter is usually 'ie' — confirm
    }
    url = 'https://www.baidu.com/s?'+parse.urlencode(args)
    # Print the encoded URL once (the original printed it twice).
    print(url)

    # Attach a browser-like User-Agent header.
    ua = UserAgent()
    headers = {
        "User-Agent": ua.chrome
    }
    request = Request(url,headers=headers)

    response = urlopen(request)

    info = response.read()

    print(info.decode())
    

    Post请求

    from urllib.request import Request,urlopen
    from urllib.parse import urlencode
    from fake_useragent import UserAgent
    import ssl


    # POST example: submit a form field and print the server's reply.
    url = 'https://www.maguangyi.top/tzsc/login.php'

    # The form field(s) to submit.
    form_data = {
        'pwd' : '123456'
    }

    headers = {
        'User-Agent' : UserAgent().chrome
    }

    # Passing encoded bytes as data= makes urllib send a POST request.
    payload = urlencode(form_data).encode()

    request = Request(url, data=payload, headers=headers)

    # Disable SSL certificate verification for this request.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    response = urlopen(request, context=ctx)

    print(response.read().decode())
    
  • 相关阅读:
    Python 文件操作
    Python 操作 sqlite
    Python中的random模块
    Linux系统下的/etc/nsswitch.conf文件
    Python 列表/元组/字典总结
    快斗之翼:python2的print和python3的print()
    田小计划:图解Python深拷贝和浅拷贝
    Python 自省指南
    Python运算符优先级
    tc: 模拟网络异常的工具
  • 原文地址:https://www.cnblogs.com/maguangyi/p/14194594.html
Copyright © 2011-2022 走看看