zoukankan      html  css  js  c++  java
  • python爬虫笔记01

    1.urllib库中request,parse的学习

    1.1 简单的请求页面获取,并下载到本地 request的使用

    from urllib import request
    
    # 获取此网页的demout
    resp = request.urlopen('http://www.baidu.com')
    
    # 读出10个字符
    # 结果为 b'<!DOCTYPE '  b代表bytes 是一个字节流
    # '<!DOCTYPE ' 包括空格 正好十个字符
    # print(resp.read(10))
    
    # 读取一行
    # 结果为 b'<!DOCTYPE html>
    '
    # 
     代表换行
    # ()里面可以写读几行
    # print(resp.readline())
    
    # 全部读取  二者对应的返回类型不同
    # <class 'list'> readlines
    # <class 'bytes'> read
    # print(type(resp.readlines()))
    # print(type(resp.read()))
    
    
    # 下载到本地文件夹 
    request.urlretrieve('http://www.baidu.com', 'baidu.html')

    1.2 parse的使用

    1.2.1 解决中文与码的对应问题

    例:中文变成码 name=%E9%AB%98%E8%BE%BE 这些带%和中文之间的转换

     1 # 主要解决网址中中文是解析不了的问题
     2 
     3 from urllib import request
     4 from urllib import parse
     5 
     6 # 中文变成码 name=%E9%AB%98%E8%BE%BE&age=21
     7 
     8 # urlencode函数 把字典转化为字符串
     9 params = {'name': '高达', 'age': '21'}
    10 result = parse.urlencode(params)
    11 # 结果为
    12 # <class 'str'>
    13 # name=%E9%AB%98%E8%BE%BE&age=21
    14 print(type(result))
    15 print(result)
    16 
    17 # 如果直接在网址写中文会报错,虽然我们看到的是中文但是实际上是中文对应的码
    18 # resp = request.urlopen('https://www.baidu.com/s?wd=刘德华')
    19 
    20 
    21 # 写出一个字典
    22 params = {'wd': '刘德华'}
    23 print(params)
    24 # 转化成正确的网站上的格式
    25 qs = parse.urlencode(params)
    26 print(qs)
    27 # 这样请求就能成功
    28 resp = request.urlopen('https://www.baidu.com/s?'+qs)
    29 # print(resp.readlines())
    30 # parse.parse_qs函数 还原解码
    31 result = parse.parse_qs(qs)
    32 # 结果为
    33 # <class 'dict'>
    34 # {'wd': ['刘德华']}
    35 print(type(result))
    36 print(result)

    1.2.2 解析网址

    # 解析网址
    from urllib import parse
    
    
    def test():
        # 测试用的 url
        url = 'http://www.baidu.com/s;sssas?wd=python&username=abc#1'
        url1 = 'http://www.baidu.com/s?wd=python&username=abc#1'
        url2 = 'https://api.bilibili.com/x/v2/reply?callback=jQuery17204184039578913048_1580701507886&jsonp=jsonp&pn=1&type=1&oid=70059391&sort=2&_=1580701510617'
        result1 = parse.urlparse(url)
        # <class 'urllib.parse.ParseResult'>
        print(type(result1))
        # 结果为 ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='sssas', query='wd=python&username=abc',
        # fragment='1')
        # print(result1)
        # print('scheme:', result1.scheme)
        # print('netloc:', result1.netloc)
        # print('path: ', result1.path)
        # print('params: ', result1.params)
        # print('query: ', result1.query)
        # print('fragment: ', result1.fragment)
        result2 = parse.urlsplit(url)
        print(result2)
        # urlparse函数与 urlsplit函数的对比
        # urlparse函数多一个params
        result1 = parse.urlparse(url1)
        result2 = parse.urlsplit(url1)
        print(result1)
        print(result2)
        # result3 = parse.urlparse(url2)
        # print('r3:', result3.query)
    
    
    # 格式化输出函数
    def myself_urlparse(url):
        result = parse.urlparse(url)
        print('scheme:', result.scheme)
        print('netloc:', result.netloc)
        print('path: ', result.path)
        print('params: ', result.params)
        print('query: ', result.query)
        print('fragment: ', result.fragment)
    
    
    def myself_urlsplit(url):
        result = parse.urlsplit(url)
        print('scheme:', result.scheme)
        print('netloc:', result.netloc)
        print('path: ', result.path)
        print('query: ', result.query)
        print('fragment: ', result.fragment)
    
    
    if __name__ == '__main__':
        test()

     1.3 使用代理IP

    # ip代理
    from urllib import request
    # 没有代理是用本地ip
    url = 'http://www.baidu.com'
    resp = request.urlopen(url)
    # print(resp.read())
    
    # 使用代理
    handler = request.ProxyHandler({'http': '60.170.234.221:65309'})
    opener = request.build_opener(handler)
    resp = opener.open(url)
    print(resp.read())

    1.4CookieJar

    cokkie信息可以直接写在headers请求头里面,也可以创建CookieJar对象

    # cookie信息
    from urllib import request
    from http.cookiejar import CookieJar
    from urllib import parse
    # 登录
    
    cookiejar = CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar)
    opener = request.build_opener(handler)
    # 最简单的就是在handers中写cookie
    handers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    
    data = {
        'login-username': '15932639701',
        'login-passwd': 'w5153300'
    }
    url = 'https://passport.bilibili.com/login'
    
    req = request.Request(url, data=parse.urlencode(data).encode('utf-8'), headers=handers, method='GET')
    # 自动保存cokie信息
    opener.open(req)
    
    # 访问个人主页
    url1 = 'https://space.bilibili.com/141417102'
    
    # 使用之前的opener
    req = request.Request(url1, headers=handers)
    resp = opener.open(url1)
    # 写入本地
    with open('a.html', 'w', encoding='utf-8')as fp:
        fp.write(resp.read().decode('utf-8'))

    2.1 requests的使用 它是对urllib的再次封装,它们使用的主要区别:
    requests可以直接构建常用的get和post请求并发起,urllib一般要先构建get或者post请求,然后再发起请求。

    import requests
    
    resp = requests.get("https://www.baidu.com/")
    #
    #
    # 返回响应内容 源码 返回对象str 有乱码就要decode解码
    # print(type(resp.text))
    # print(resp.text.decode('utf-8'))
    
    # 返回响应内容 源码 返回对象bytes
    # print(type(resp.content))
    # print(resp.content)
    
    #
    print(resp.url)
    # https://www.baidu.com/
    print(resp.encoding)
    # ISO-8859-1
    print(resp.status_code)#
    200
    print(resp.headers)
    print(resp.cookies)
    import requests
    
    # 写入请求头
    
    # params = {
    #     'wd': '中国'
    # }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    }
    resp = requests.get("https://www.baidu.com/s?wd=%E4%B8%AD%E5%9B%BD&rsv_spt=1&rsv_iqid=0xc673aab900009518&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=56060048_11_pg&ch=6&rsv_enter=0&rsv_dl=tb&rsv_sug3=4&rsv_sug1=2&rsv_sug7=101&prefixsug=%25E4%25B8%25AD%25E5%259B%25BD%2520&rsp=1&inputT=5214&rsv_sug4=5933",headers=headers)
    
    with open('a.html', 'w', encoding='utf-8')as fp:
        fp.write(resp.content.decode('utf-8'))
        print(resp.url)
    import requests
    
    # 使用代理ip
    proxy = {
        'http': 'http://47.100.124.14/'
    }
    resp = requests.get('https://www.baidu.com', proxies=proxy)
    print(resp.content.decode('utf-8'))
    import requests
    resp = requests.get("https://www.baidu.com/")
    # 查看cookies信息
    print(resp.cookies.get_dict())
    session = requests.Session()
    # 处理不信任的SSL证书
    # 网站是https需要
    # 出现错误是因为这个网站不是这个证书
    import requests
    resp = requests.post("https://www.baidu.com/", verify=False)

    3.1 XPath

    XPath语法

     补充 :text()是提取该标签下的文本

     图片来自于https://www.w3school.com.cn/xpath/xpath_syntax.asp

    import requests
    from lxml import html
    
    etree = html.etree
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse('a.html', parser=parser)
    divs = html.xpath('//div')
    # 返回一个列表
    for div in divs:
        print(etree.tostring(div, encoding='utf-8').decode('utf-8'))
    import requests
    from lxml import html
    
    etree = html.etree
    url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
    headers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
       'Referer': 'https://movie.douban.com/'
    }
    resp = requests.get(url, headers=headers)
    # print(text)
    text = resp.content.decode('gbk')
    # print(text)
    html = etree.HTML(text)
    # []里面@XXX=XXX是寻找特定的属性,在/后面@属性是要这个值的
    urls = html.xpath("//table[@class='tbspan']//a/@href")
    # nead_urls = []
    for url in urls:
        # nead_urls.append('https://www.dytt8.net'+url)
        print('https://www.dytt8.net'+url)
    import requests
    from lxml import html
    
    etree = html.etree
    
    # 请求头  网站url
    url = 'https://movie.douban.com/cinema/nowplaying/langfang/'
    headers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
       'Referer': 'https://movie.douban.com/'
    }
    resp = requests.get(url,headers=headers)
    # text str content bytes
    # text 解码过的
    # print(resp.content.decode('utf-8'))
    # print(resp.text)
    
    # 转化为html 对象 <class 'lxml.etree._Element'>
    html = etree.HTML(resp.text)
    print(type(html))
    ul = html.xpath("//ul[@class='lists']")[0]
    # print(ul)
    # print(etree.tostring(ul,encoding='utf-8').decode('utf-8')
    # 取出li标签存入lis
    lis = ul.xpath("./li")
    # print(etree.tostring(li,encoding='utf-8').decode('utf-8'))
    a = html.xpath("//div[@id='upcoming']//a[@data-psource='title']")
    for x in a:
        name = x.xpath('@title')
        href = x.xpath('@href')
        print(href)
        print(name)
    # for li in lis:
    #     # 取出@data-title属性的值
    #     name = li.xpath("@data-title")
    #     print(name)
    #     # 取出图片链接
    #     img = li.xpath(".//img/@src")
    #     print(img)

    3.2 正则表达式

    import re
    import requests
    
    
    def parse_page(url):
        headers = {
            'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 79.0.3945.130Safari / 537.36'
        }
        resp = requests.get(url,headers)
        text = resp.text
        # re.DOTALL .也可以匹配换行符
        titles = re.findall(r'<divsclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        print(titles)
    
    
    def main():
        # url ='https://www.gushiwen.org/default_1.aspx'
        for x in range(1, 10):
            url ='https://www.gushiwen.org/default_%s.aspx' % x
            parse_page(url)
    
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    [TJOI2019]大中锋的游乐场——最短路+DP
    [TJOI2019]甲苯先生的滚榜——非旋转treap
    [TJOI2019]甲苯先生的字符串——矩阵乘法+递推
    [TJOI2019]唱、跳、rap和篮球——NTT+生成函数+容斥
    [ZJOI2020]字符串
    Ubuntu 20.04 工作区小记
    2020省选犯傻记
    寒假到省选的一些笔记
    AtCoder tokiomarine2020 题解
    [CF1336E]Chiori and Doll Picking
  • 原文地址:https://www.cnblogs.com/gonT-iL-evoL-I/p/12258171.html
Copyright © 2011-2022 走看看