zoukankan      html  css  js  c++  java
  • 使用xpath技术爬取段子网

    from lxml import etree
    import time
    import json
    import urllib.request
    item_list = []  # List that stores the scraped information
    
    
    # 构造request对象
    def handler_request(url, page):
        """Build the request object for one listing page.

        Args:
            url: Base listing URL; the page number is appended to it directly.
            page: Page number; converted with ``str()`` before being appended.

        Returns:
            A ``urllib.request.Request`` for ``url + str(page)`` that carries
            a browser-like ``user-agent`` header.
        """
        headers = {
            # Implicit concatenation keeps the value one logical string; a raw
            # line break inside the quotes is a syntax error.
            "user-agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            ),
        }
        get_url = url + str(page)
        return urllib.request.Request(url=get_url, headers=headers)
    
    
    # 解析获取的html文件
    def parse_content(content):
        """Extract the articles of one listing page into ``item_list``.

        Args:
            content: HTML text of a listing page.

        Every ``article`` element under the main content column is appended to
        the module-level ``item_list`` as a dict with the keys '标题' (title)
        and '内容' (the paragraph texts joined by newlines). Returns nothing.
        """
        # Build the element tree used for the XPath queries
        tree = etree.HTML(content)
        article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
        # Walk through the article list
        for article in article_list:
            # Title: fall back to an empty string so that one article without
            # a title link does not abort the remaining articles of the page
            titles = article.xpath('.//div[@class="post-head"]/h1/a/text()')
            title = titles[0] if titles else ''
            # Body: one text node per paragraph, joined with newlines
            paragraphs = article.xpath('.//div[@class="post-content"]/p/text()')
            text = '\n'.join(paragraphs)
            item = {
                '标题': title,
                '内容': text,
            }
            item_list.append(item)
    
    
    def main():
        """Prompt for a page range, scrape each page and save the results.

        Reads the first and last page number from standard input, requests
        every page in that inclusive range below ``http://duanziwang.com/page/``
        and hands the decoded HTML to ``parse_content``. A page that fails is
        reported on standard output and skipped. Afterwards ``item_list`` is
        dumped as JSON (non-ASCII characters kept as-is) into ``duanzi.txt``,
        encoded as UTF-8.
        """
        start_page = int(input("请输入查询起始页面:"))
        end_page = int(input("查询结束页面:"))
        url = "http://duanziwang.com/page/"
        for page in range(start_page, end_page + 1):
            request = handler_request(url, page)
            try:
                # The context manager closes the HTTP response even when
                # reading or decoding fails.
                with urllib.request.urlopen(request) as response:
                    content = response.read().decode()
                parse_content(content)
            except Exception:
                # Best effort: one bad page must not stop the crawl. Unlike a
                # bare ``except``, this still lets KeyboardInterrupt and
                # SystemExit propagate.
                print("第%d页面爬取失败" % page)
        string = json.dumps(item_list, ensure_ascii=False)
        with open('duanzi.txt', "w", encoding='utf-8') as f:
            f.write(string)
    
    
    # Start the scraper only when this file is run as a script, not on import
    if __name__ == '__main__':
        main()
  • 相关阅读:
    38) 收集centos所有版本镜像下载地址
    37) 查看linux 操作系统为多少位
    php面向对象高级应用一
    php form表单的验证+提交到数据库
    php获取form表单数据
    php form表单的提交
    php form表单概念
    php日期和时间的应用
    php日期和时间函数
    php字符串函数操作实例(2)
  • 原文地址:https://www.cnblogs.com/nxrs/p/11365422.html
Copyright © 2011-2022 走看看