zoukankan      html  css  js  c++  java
  • python--(爬虫-re模块)

    python--(爬虫-re模块)

    re模块四大核心功能:

    1.findall 查找所有,返回list

    import re
    # findall returns every non-overlapping match as a list of strings.
    lst = re.findall("m", "mai le fo len, mai ni mei!")
    print(lst) # ['m', 'm', 'm']

    # \d+ matches runs of digits; the raw string keeps the backslash intact.
    lst = re.findall(r"\d+", "5点之前. 你要给我5000")
    print(lst) # ['5', '5000']

    2.search 会进行匹配,但如果匹配到了第一个结果,就会返回这个结果,
    如果匹配不上search返回的则是None

    import re
    # \d matches one digit; search() returns the first match found anywhere in
    # the string, or None when nothing matches (calling .group() on None raises).
    ret = re.search(r'\d', '5点之前. 你要给我5000万').group()
    print(ret) # 5


    3. match 只能从字符串的开头进⾏匹配
    import re
    # match() only succeeds when the pattern matches at the very start of the string.
    first = re.match('a', 'abc')
    ret = first.group()
    print(ret) # a


    4. finditer 和findall差不多. 只不过这时返回的是迭代器
    import re
    # finditer yields match objects lazily instead of building a list of strings.
    it = re.finditer("m", "mai le fo len, mai ni mei!")
    for found in it:
        # Each item is a match object, so group() is still needed to get the text.
        print(found.group())

        5.re模块相关操作

    import re
    
    #   split  切割. 按照正则切割.
    # lst = re.split(r"[ab]", "abcdefghahahehedebade")
    # print(lst)
    
    #   sub 替换.
    # result = re.sub("250", "__sb__", "alex250taibai250taihei250ritian250liuwei")
    # print(result)
    
    # result = re.subn("250", "__sb__", "alex250taibai250taihei250ritian250liuwei")
    # print(result)
    
    # obj = re.compile(r"\d+")
    # lst = obj.findall("大阳哥昨天赚了5000块")
    # lst2 = obj.findall("银行流水5000, 花了6000")
    # print(lst)
    # print(lst2)
    
    
    
    # obj = re.compile(r"(?P<id>\d+)(?P<zimu>e{3})")
    # ret = obj.search("abcdefg123456eeeee") # ((123456)(eee))
    # print(ret.group())
    # print(ret.group("id"))
    # print(ret.group("zimu"))
    
    
    # ret = re.findall('www.(baidu|oldboy).com', 'www.oldboy.com')
    # print(ret) # 这是因为findall会优先把匹配结果组⾥内容返回,如果想要匹配结果,取消权限即可
    # ret = re.findall('www.(?:baidu|oldboy).com', 'www.oldboy.com') # ?: 当前的()不分组
    # print(ret) # ['www.oldboy.com']
    
    
    # ret=re.split("sb","alexsbwusirsbtaibaisbliuwei")
    # print(ret)
    View Code

    爬虫重点:爬取豆瓣网站相关信息===>

    import re
    from urllib.request import urlopen  # 打开一个链接. 读取源代码
    import ssl
    # Make the default HTTPS context an unverified one, so certificates are not
    # checked for HTTPS requests that use the default context.
    # NOTE(review): this relies on private ssl attributes and removes protection
    # against man-in-the-middle attacks; presumably acceptable for a demo only.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    
    def getPage(url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address to open with ``urlopen``.

        Returns:
            The full response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the URL cannot be opened.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails, instead of leaking it.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    def parsePage(s):
        """Extract movie records from the HTML source of a Douban Top 250 page.

        Args:
            s: Page source code as a string.

        Returns:
            A list of ``(id, title, rating_num, comment_num)`` string tuples,
            one per matched movie entry; an empty list when nothing matches.
        """
        # The id group must be \d+ (digits), not a literal "d+"; raw strings
        # keep the backslash intact. re.S lets "." span newlines, because each
        # movie entry is spread over many lines of HTML.
        pattern = (
            r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
            r'<span class="title">(?P<title>.*?)</span>'
            r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
            r'(?P<comment_num>.*?)评价</span>'
        )
        return re.findall(pattern, s, re.S)
    
    def main(num):
        """Fetch one Douban Top 250 page and print the parsed movie tuples.

        Args:
            num: Value substituted into the ``start`` query parameter of the URL.
        """
        page_url = 'https://movie.douban.com/top250?start=%s&filter=' % num
        # Download the page source, then extract
        # (id, title, rating_num, comment_num) tuples from it.
        movies = parsePage(getPage(page_url))
        print(movies)
    
    # Call main ten times with start offsets 0, 25, ..., 225.
    count = 0
    while count < 250:
        main(count)
        count += 25
    方法一
    import re
    from urllib.request import urlopen  # 打开一个链接. 读取源代码
    import ssl
    # Make the default HTTPS context an unverified one, so certificates are not
    # checked for HTTPS requests that use the default context.
    # NOTE(review): this relies on private ssl attributes and removes protection
    # against man-in-the-middle attacks; presumably acceptable for a demo only.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    
    def getPage(url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address to open with ``urlopen``.

        Returns:
            The full response body decoded as UTF-8.

        Raises:
            urllib.error.URLError: If the URL cannot be opened.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails, instead of leaking it.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    
    def parsePage(s):
        """Lazily yield one dict per movie found in a Douban Top 250 page.

        Args:
            s: Page source code as a string.

        Yields:
            Dicts with the string-valued keys ``"id"``, ``"title"``,
            ``"rating_num"`` and ``"comment_num"``, in document order.
        """
        # The id group must be \d+ (digits), not a literal "d+"; raw strings
        # keep the backslash intact. re.S lets "." span newlines, because each
        # movie entry is spread over many lines of HTML.
        com = re.compile(
            r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
            r'<span class="title">(?P<title>.*?)</span>'
            r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
            r'(?P<comment_num>.*?)评价</span>', re.S)
        for match in com.finditer(s):
            yield {
                "id": match.group("id"),
                "title": match.group("title"),
                "rating_num": match.group("rating_num"),
                "comment_num": match.group("comment_num"),
            }
    
    
    def main(num):
        """Fetch one Douban Top 250 page and append its movies to ``move_info7``.

        Each parsed movie dict is printed and written to the file as one line.

        Args:
            num: Value substituted into the ``start`` query parameter of the URL.
        """
        # Substitute the offset into the URL; without "% num" every call would
        # request the literal "%s" instead of the intended page.
        url = 'https://movie.douban.com/top250?start=%s&filter=' % num
        response_html = getPage(url)
        print(response_html)
        # The context manager closes the file even if parsing or writing raises.
        with open("move_info7", "a", encoding="utf8") as f:
            for obj in parsePage(response_html):
                print(obj)
                f.write(str(obj) + "\n")
    
    # Call main ten times with start offsets 0, 25, ..., 225.
    count = 0
    while count < 250:
        main(count)
        count += 25
    爬取并写入文件
  • 相关阅读:
    Eclipse背景颜色修改
    Android动画效果translate、scale、alpha、rotate详解
    代理上网的方法
    ubuntu系统使用SSH免密码登陆
    Git的思想和基本工作原理
    GitHub详细教程
    Ubuntu和Redhat(Debian)的差别
    T2: 一种能累积计算积分的EC2实例类型
    win server 2008 r2 iis+php 500错误内部服务器错误。
    从OTF字体文件里查找字体名称
  • 原文地址:https://www.cnblogs.com/konghui/p/9778289.html
Copyright © 2011-2022 走看看