• Python爬虫re解析实战


    1.如下内容,欲练此功,必先....正则

    """古诗文爬取"""
    import requests
    import re
    
    
    def parse_page(url, html=None):
        """Fetch one gushiwen.org listing page, extract its poems and print them.

        :param url: listing-page URL to download (ignored when *html* is given)
        :param html: optional pre-fetched page source; when provided the HTTP
                     request is skipped (also makes the parser unit-testable)
        :return: list of dicts with "title" / "dynastie" / "author" / "content" keys
        """
        if html is None:
            rep = requests.get(
                url=url,
                headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"}
            )
            html = rep.text
        # Poem titles.  NOTE: the original pattern read '<divsclass=...' -- the
        # backslash of '\s' was lost when the code was pasted; restored here.
        titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', html, re.DOTALL)
        # Dynasty: first <a> inside the "source" line.  re.DOTALL added for
        # consistency with the other patterns so multi-line markup still matches.
        dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', html, re.DOTALL)
        # Author: second <a> inside the "source" line.
        authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', html, re.DOTALL)
        # Raw poem bodies, inline tags still included.
        content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', html, re.DOTALL)
        # Strip the remaining inline tags (<br/> etc.) from each poem body.
        contents = [re.sub(r"<.*?>", "", content).strip() for content in content_tags]
        # zip pairs the i-th title/dynasty/author/content together.
        poems = [
            {"title": title, "dynastie": dynastie, "author": author, "content": content}
            for title, dynastie, author, content in zip(titles, dynasties, authors, contents)
        ]
        print(poems)
        return poems
    
    
    def main():
        """Walk listing pages 1..100 and hand each page URL to parse_page."""
        url_template = "https://www.gushiwen.org/default_{}.aspx"
        for page_no in range(1, 101):
            parse_page(url_template.format(page_no))
    
    
    if __name__ == '__main__':
        # Script entry point: crawl every listing page when run directly.
        main()
    View Code
    """糗事百科笑话段子"""
    import requests
    import re
    
    
    def parse_detail(url, html=None):
        """Fetch one qiushibaike.com text page, extract its jokes and print them.

        :param url: page URL to download (ignored when *html* is given)
        :param html: optional pre-fetched page source; when provided the HTTP
                     request is skipped (also makes the parser unit-testable)
        :return: list of dicts with "user" / "age" / "content" keys
        """
        if html is None:
            rep = requests.get(
                url=url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"
                }
            )
            html = rep.content.decode(encoding="utf-8")
        # Usernames.  NOTE: the original patterns read '<divsclass=...' -- the
        # backslash of '\s' was lost when the code was pasted; restored here.
        users_tag = re.findall(r'<div\sclass="author clearfix">.*?<h2>(.*?)</h2>', html, re.DOTALL)
        users = [user.strip() for user in users_tag]
        # Age: first nested <div> after the author block.
        ages = re.findall(r'<div\sclass="author clearfix">.*?<div.*?>(.*?)</div>', html, re.S)
        # Joke bodies, inline tags still included ('articlesblock' was likewise
        # a mangled 'article\sblock').
        content_tags = re.findall(r'<div\sclass="article\sblock.*?">.*?<span>(.*?)</span>', html, re.S)
        # Strip inline tags (<br/> etc.) from each joke body.
        contents = [re.sub(r'<.*?>', "", content).strip() for content in content_tags]
        # zip pairs the i-th user/age/content together.
        user_infos = [
            {"user": user, "age": age, "content": content}
            for user, age, content in zip(users, ages, contents)
        ]
        print(user_infos)
        return user_infos
    
    
    def spider():
        """Iterate joke pages 1..13 and hand each page URL to parse_detail."""
        url_template = "https://www.qiushibaike.com/text/page/{}/"
        for page_no in range(1, 14):
            parse_detail(url_template.format(page_no))
    
    
    if __name__ == '__main__':
        # Script entry point: crawl every joke page when run directly.
        spider()
    View Code

    学习正则链接:http://www.runoob.com/regexp/regexp-tutorial.html

  • 相关阅读:
    Spring中Bean及@Bean的理解
    Visual Studio(VS)秘钥集合
    PLC不能初始化问题
    【原创】C# API 未能创建 SSL/TLS 安全通道 问题解决
    【原创】XAF CriteriaOperator 使用方式汇总
    【原创】.Net WebForm Calendar 日历控件常用方法
    【原创】XAF常用属性字段设置
    【记录】Windows 操作系统常用快捷命令
    【XAF】常见错误以及对应解决方法
    【原创】XAF ITreeNode+NonPersistent 使用方式
  • 原文地址:https://www.cnblogs.com/Guishuzhe/p/9841721.html
走看看 - 开发者的网上家园