  • Full-site crawling of cnblogs

    • Create the project
    """
    # 1 scrapy startproject cnblogs_crawl
    # 2 scrapy genspider cnblogs www.cnblogs.com
    """
    
    • Spider code
    """
    # -*- coding: utf-8 -*-
    import scrapy
    from cnblogs_crawl.items import CnblogsCrawlItem
    from scrapy.http import Request
    
    
    class CnblogsSpider(scrapy.Spider):
        name = 'cnblogs'
        allowed_domains = ['www.cnblogs.com']
        start_urls = ['http://www.cnblogs.com/']
    
        def parse(self, response):
            div_list = response.css('.post_item')
            for div in div_list:
                item = CnblogsCrawlItem()
                item['title'] = div.css('h3>a::text').extract_first()
                item['url'] = div.css('h3>a::attr(href)').extract_first()
                item['author'] = div.css('.post_item_foot a::text').extract_first()
                # The summary is split across several text nodes; the last
                # node holds the actual abstract.
                item['desc'] = div.css('.post_item_summary::text').extract()[-1]
    
                # Don't yield the item yet -- the list page only has the
                # summary. Request the detail page for the full body; without
                # an explicit callback Scrapy would send the response back to
                # parse, so route it to parse_detail and pass the partially
                # filled item along via meta. Persistence happens in the
                # pipeline once parse_detail finally yields the item.
                yield Request(item['url'], callback=self.parse_detail, meta={'item': item})
    
            next_url = response.css('div.pager a:last-child::attr(href)').extract_first()
            # Follow the next listing page. The pager href is relative
            # (e.g. /sitehome/p/2), so join it against the response URL;
            # bare string concatenation would produce a double slash.
            if next_url:
                yield Request(response.urljoin(next_url))
    
    
        # Fetch the full post body from the detail page; different blog
        # themes render it differently, hence the fallback selector.
        def parse_detail(self, response):
            item = response.meta.get('item')
            content = response.css('#cnblogs_post_body').extract_first()
            if not content:
                # Fallback for posts rendered with a different template.
                content = response.css('content').extract_first()
            item['content'] = content
            yield item
    
    
    """
    
    • Run from a script (one click in the IDE)
    """
    # Save as run.py in the project root; running it is equivalent to
    # `scrapy crawl cnblogs` in the terminal, but lets you launch the
    # spider (or attach a debugger) straight from the IDE.
    from scrapy.cmdline import execute

    execute(['scrapy', 'crawl', 'cnblogs'])
    """
    
    • Persist scraped data to the database
    """
    import pymysql
    
    # Persist each item to MySQL: open one connection per crawl,
    # insert a row per item, and close the connection on shutdown.
    class CnblogsCrawlPipeline(object):

        def open_spider(self, spider):
            self.conn = pymysql.Connect(host='127.0.0.1', port=3306, db='cnblogs', user='root', password='123456')

        def process_item(self, item, spider):
            cursor = self.conn.cursor()
            sql = '''insert into article (title, url, `desc`, content, author) values (%s, %s, %s, %s, %s)'''
            # Parameterized query: pymysql handles quoting and escaping.
            cursor.execute(sql, args=(item['title'], item['url'], item['desc'], item['content'], item['author']))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            self.conn.close()
    """
    