zoukankan      html  css  js  c++  java
  • Python爬虫re解析实战

    1.如下内容,欲练此功,必先....正则

    """古诗文爬取"""
    import requests
    import re
    
    
    def parse_page(url):
        """Fetch one listing page of gushiwen.org, extract every poem on it
        with regexes, print the list of poem dicts, and return it.

        Each dict has keys: "title", "dynastie", "author", "content".

        :param url: full URL of one gushiwen.org default_N.aspx listing page
        :return: list of poem dicts (also printed, preserving the original
                 script's behavior)
        """
        rep = requests.get(
            url=url,
            headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"},
            timeout=10,  # avoid hanging forever on a dead connection
        )
        text = rep.text
        # Poem titles live in <div class="cont"> ... <b>title</b>.
        # NOTE(review): the pattern originally read '<divsclass=' — the
        # backslash of '\s' was stripped when the code was pasted into the
        # blog; '\s+' restores the intended whitespace match.
        titles = re.findall(r'<div\s+class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
        # Dynasty is the first <a> inside <p class="source">.
        # re.DOTALL added for consistency with the sibling patterns — the tag
        # and the anchor may sit on different lines in the HTML.
        dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
        # Author is the second <a> inside <p class="source">.
        # NOTE(review): the site lists author/dynasty in some order — the
        # "dynasties"/"authors" labels may be swapped; verify against live HTML.
        authors = re.findall(r'<p class="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL)
        # Poem body: everything inside <div class="contson" ...>.
        content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
        # Strip residual tags (<br/> etc.) and surrounding whitespace.
        contents = [re.sub(r"<.*?>", "", content).strip() for content in content_tags]
        # zip truncates to the shortest list, so a page with a partial match
        # set simply yields fewer poems instead of raising.
        poems = [
            {
                "title": title,
                "dynastie": dynastie,
                "author": author,
                "content": content,
            }
            for title, dynastie, author, content in zip(titles, dynasties, authors, contents)
        ]
        print(poems)
        return poems
    
    
    def main():
        """Crawl listing pages 1 through 100 of gushiwen.org, one at a time."""
        for page_no in range(1, 101):
            page_url = "https://www.gushiwen.org/default_{}.aspx".format(page_no)
            parse_page(page_url)
    
    
    # Entry point: crawl all 100 listing pages when run as a script.
    if __name__ == '__main__':
        main()
    View Code
    """糗事百科笑话段子"""
    import requests
    import re
    
    
    def parse_detail(url):
        """Fetch one page of qiushibaike.com text jokes, extract author name,
        author age and joke text with regexes, print the list of dicts, and
        return it.

        Each dict has keys: "user", "age", "content".

        :param url: full URL of one /text/page/N/ listing page
        :return: list of user-info dicts (also printed, preserving the
                 original script's behavior)
        """
        rep = requests.get(
            url=url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36"
            },
            timeout=10,  # avoid hanging forever on a dead connection
        )
        text = rep.content.decode(encoding="utf-8")
        # Author names: <div class="author clearfix"> ... <h2>name</h2>.
        # NOTE(review): the patterns originally read '<divsclass=' /
        # 'articlesblock' — the backslash of '\s' was stripped when the code
        # was pasted into the blog; '\s+' restores the intended match.
        users_tag = re.findall(r'<div\s+class="author clearfix">.*?<h2>(.*?)</h2>', text, re.DOTALL)
        users = [user.strip() for user in users_tag]
        # Author ages: first nested <div> after the author block.
        ages = re.findall(r'<div\s+class="author clearfix">.*?<div.*?>(.*?)</div>', text, re.S)
        # Joke bodies: <span> inside <div class="article block ...">.
        content_tags = re.findall(r'<div\s+class="article\s+block.*?">.*?<span>(.*?)</span>', text, re.S)
        # Strip residual tags (<br/> etc.) and surrounding whitespace.
        contents = [re.sub(r'<.*?>', "", content).strip() for content in content_tags]
        # zip truncates to the shortest list, so partially-matched pages
        # simply yield fewer records instead of raising.
        user_infos = [
            {
                "user": user,
                "age": age,
                "content": content,
            }
            for user, age, content in zip(users, ages, contents)
        ]
        print(user_infos)
        return user_infos
    
    
    def spider():
        """Crawl text-joke listing pages 1 through 13, one at a time."""
        for page_no in range(1, 14):
            page_url = "https://www.qiushibaike.com/text/page/{}/".format(page_no)
            parse_detail(page_url)
    
    
    # Entry point: crawl all 13 joke pages when run as a script.
    if __name__ == '__main__':
        spider()
    View Code

    学习正则链接:http://www.runoob.com/regexp/regexp-tutorial.html

  • 相关阅读:
    nginx php-fpm 输出php错误日志
    图解phpstorm常用快捷键
    MySQL性能分析 —— General log(普通日志)与 Slow log(慢速日志)
    Mac Terminal
    Git安装与配置
    Linux ulimit
    tcpdump
    Linux 基础
    TCP
    HTTP
  • 原文地址:https://www.cnblogs.com/Guishuzhe/p/9841721.html
Copyright © 2011-2022 走看看