  • Scraping Jobbole (blog.jobbole.com) articles with Scrapy

    Create the spider project

    (p3scrapy) [vagrant@reboot vagrant]$ scrapy startproject ArticleSpider
    You can start your first spider with:
        cd ArticleSpider
        scrapy genspider example example.com
    (p3scrapy) [vagrant@reboot ArticleSpider]$ scrapy genspider jobbole blog.jobbole.com
    

    Full project structure

    (p3scrapy) [vagrant@reboot ArticleSpider]$ tree .
    .
    ├── ArticleSpider
    │   ├── images
    │   │   └── full
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   ├── jobbole.py
    ├── main.py
    └── scrapy.cfg
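
    Note: main.py is not created by scrapy startproject; it is a hand-written launcher script (shown below). The images/full directory is created at run time by the images pipeline, which saves downloaded cover images under a full/ subfolder of IMAGES_STORE.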
    

    Table schema

    CREATE TABLE `article` (
      `title` varchar(200) NOT NULL,
      `create_date` date DEFAULT NULL,
      `url` varchar(300) NOT NULL,
      `url_object_id` varchar(50) NOT NULL,
      `front_image_url` varchar(300) DEFAULT NULL,
      `front_image_path` varchar(200) DEFAULT NULL,
      `praise_nums` int(11) DEFAULT NULL,
      `fav_nums` int(11) DEFAULT NULL,
      `comment_nums` int(11) DEFAULT NULL,
      `tags` varchar(200) DEFAULT NULL,
      `content` longtext NOT NULL,
      PRIMARY KEY (`url_object_id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
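
    Because url_object_id is the md5 hash of the article url (computed by the get_md5 helper used below) and serves as the primary key, the same article crawled twice maps to the same row; with the plain INSERT used later, a re-crawl will raise a duplicate-key error, which the commented-out ON DUPLICATE KEY UPDATE variant in items.py is one way to avoid.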
    

    jobbole.py

    # -*- coding: utf-8 -*-
    import scrapy, re
    from scrapy.http import Request
    from urllib import parse
    from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
    from ArticleSpider.utils.common import get_md5
    from datetime import datetime
    from scrapy.loader import ItemLoader


    class JobboleSpider(scrapy.Spider):
        name = 'jobbole'
        allowed_domains = ['blog.jobbole.com']
        start_urls = ['http://blog.jobbole.com/all-posts/']
    
        def parse(self, response):
            """
            1. Extract the article urls from the list page and hand them to scrapy to download and parse.
            2. Extract the next-page url and hand it to scrapy to download; the response comes back to parse.
            :param response:
            :return:
            """
            post_nodes = response.css("#archive .floated-thumb .post-thumb a")
            for post_node in post_nodes:
                image_url = post_node.css("img::attr(src)").extract_first("")
                post_url = post_node.css("::attr(href)").extract_first("")
                yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)
            # extract the next-page url and schedule it for download
            next_urls= response.css(".next.page-numbers::attr(href)").extract_first("")
            if next_urls:
                yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)
    
        def parse_detail(self, response):
            # Manual field extraction (the pre-ItemLoader version), kept for reference:
            # article_item = JobBoleArticleItem()
            # # extract the individual fields of the article
            # # re_selector = response.xpath("/html/body/div[3]/div[3]/div[1]/div[1]/h1")
            # # re2_selector = response.xpath('//*[@id="post-110287"]/div[1]/h1')
            # front_image_url = response.meta.get("front_image_url", "")  # article cover image
            # title = response.xpath('//*[@class="entry-header"]/h1/text()').extract_first("")
            # create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("").strip().replace('·', '').strip()
            # praise_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(""))
            # fav_nums = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract()[0]
            # match_re = re.match(r".*?(\d+).*", fav_nums)
            # if match_re:
            #     fav_nums = int(match_re.group(1))
            # else:
            #     fav_nums = 0
            # comment_nums = response.css('a[href="#article-comment"] span::text').extract_first("")
            # comment_re = re.match(r".*?(\d+).*", comment_nums)
            # comment_nums = int(comment_re.group(1)) if comment_re else 0
            # content = response.xpath('//div[@class="entry"]').extract()[0]
            #
            # tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
            # tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
            # tags = ",".join(tag_list)
            # article_item['url_object_id'] = get_md5(response.url)
            # article_item['title'] = title
            # article_item['url'] = response.url
            # try:
            #     create_date = datetime.strptime(create_date, "%Y/%m/%d").date()
            # except Exception as e:
            #     print(e.args)
            #     create_date = datetime.now().date()
            # article_item['create_date'] = create_date
            # article_item['front_image_url'] = [front_image_url]
            # article_item['praise_nums'] = praise_nums
            # article_item['comment_nums'] = comment_nums
            # article_item['fav_nums'] = fav_nums
            # article_item['tags'] = tags
            # article_item['content'] = content

            # Load the item via an ItemLoader instead
            front_image_url = response.meta.get("front_image_url", "")  # article cover image
            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
            item_loader.add_css("title", ".entry-header h1::text")
            item_loader.add_value("url", response.url)
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
            item_loader.add_value("front_image_url", [front_image_url])
            item_loader.add_css("praise_nums", ".vote-post-up h10::text")
            item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
            item_loader.add_css("fav_nums", ".bookmark-btn::text")
            item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
            item_loader.add_css("content", "div.entry")
    
            article_item = item_loader.load_item()
            yield article_item
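
    utils/common.py

    jobbole.py imports get_md5 from ArticleSpider.utils.common, a small helper module the post does not list. A minimal sketch of it (the original implementation may differ slightly) could be:

    import hashlib


    def get_md5(url):
        # hash the utf-8 encoded url and return the 32-character hex digest, which fits
        # the varchar(50) url_object_id primary key and de-duplicates re-crawled articles
        if isinstance(url, str):
            url = url.encode("utf-8")
        return hashlib.md5(url).hexdigest()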
    

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    import datetime
    import re
    import scrapy
    
    
    class ArticlespiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        pass
    
    class ArticleItemLoader(ItemLoader):
        # custom ItemLoader: output the first extracted value by default
        default_output_processor = TakeFirst()
    
    def date_convert(value):
        # the raw date text looks like "2018/11/07 ·", so strip whitespace and the trailing dot before parsing
        try:
            create_date = datetime.datetime.strptime(value.strip().replace('·', '').strip(), "%Y/%m/%d").date()
        except Exception:
            create_date = datetime.datetime.now().date()

        return create_date
    
    
    def get_nums(value):
        # pull the first run of digits out of strings like " 2 收藏" or " 2 评论"
        match_re = re.match(r".*?(\d+).*", value)
        if match_re:
            nums = int(match_re.group(1))
        else:
            nums = 0

        return nums


    def remove_comment_tags(value):
        # drop the comment count ("N 评论") that gets extracted together with the tags
        if "评论" in value:
            return ""
        else:
            return value
    
    def return_value(value):
        return value
    
    class JobBoleArticleItem(scrapy.Item):
        # title = scrapy.Field()
        # create_date = scrapy.Field()
        # url = scrapy.Field()
        # front_image_url = scrapy.Field()
        # front_image_path = scrapy.Field()
        # praise_nums = scrapy.Field()
        # fav_nums = scrapy.Field()
        # comment_nums = scrapy.Field()
        # tags = scrapy.Field()
        # content = scrapy.Field()
        # url_object_id = scrapy.Field()
    
        title = scrapy.Field()
        create_date = scrapy.Field(
            input_processor=MapCompose(date_convert),
        )
        url = scrapy.Field()
        url_object_id = scrapy.Field()
        front_image_url = scrapy.Field(
            # keep the value as a list (overriding TakeFirst) so the ImagesPipeline can iterate over it
            output_processor=MapCompose(return_value)
        )
        front_image_path = scrapy.Field()
        praise_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        comment_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        fav_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        tags = scrapy.Field(
            input_processor=MapCompose(remove_comment_tags),
            output_processor=Join(",")
        )
        content = scrapy.Field()
        def get_insert_sql(self):
            # insert_sql = """
            #     insert into article(title, url, create_date, fav_nums)
            #     VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums)
            # """
            # params = (self["title"], self["url"], self["create_date"], self["fav_nums"])
            insert_sql = """
                insert into article(title, url, create_date, fav_nums, url_object_id, praise_nums, comment_nums, tags, content, front_image_url, front_image_path)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            # front_image_url is kept as a list for the ImagesPipeline, so join it before storing;
            # front_image_path may be missing if the cover image failed to download
            params = (self['title'], self['url'], self['create_date'], self['fav_nums'],
                      self['url_object_id'], self['praise_nums'], self['comment_nums'],
                      self['tags'], self['content'], ",".join(self['front_image_url']),
                      self.get('front_image_path', ''))

            return insert_sql, params
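
    For reference, the input processors above turn the raw strings extracted from the page into clean values before they reach the item fields. A quick illustrative check, e.g. in a Python shell after importing them from ArticleSpider.items (the inputs are made-up examples, not taken from the post):

    print(get_nums(" 2 收藏"))           # -> 2, the digits pulled out of the favourite-count text
    print(date_convert("2018/11/07"))    # -> datetime.date(2018, 11, 7)
    print(remove_comment_tags("2 评论"))  # -> '', comment counts are filtered out of the tag list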
    
    
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from scrapy.pipelines.images import ImagesPipeline
    import codecs, json
    from scrapy.exporters import JsonItemExporter
    
    import MySQLdb
    import MySQLdb.cursors
    from twisted.enterprise import adbapi
    class ArticlespiderPipeline(object):
        def process_item(self, item, spider):
            return item
    
    class JsonWithEncodingPipeline(object):
        # custom export to a json file
        """Write each item to article.json as one json object per line."""
        def __init__(self):
            self.file = codecs.open('article.json', 'w', encoding="utf-8")

        def process_item(self, item, spider):
            lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(lines)
            return item

        def close_spider(self, spider):
            self.file.close()
    
    class MysqlPipeline(object):
        # write to mysql synchronously
        def __init__(self):
            self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'articlespider', charset="utf8", use_unicode=True)
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            insert_sql = """
                insert into article(title, url, create_date, fav_nums, url_object_id, front_image_path, praise_nums, comment_nums, tags, content, front_image_url)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums'], item['url_object_id'], item['front_image_path'], item['praise_nums'], item['comment_nums'], item['tags'], item['content'], item['front_image_url']))
            self.conn.commit()
            return item
    
    class MysqlTwistedPipeline(object):
        # write to mysql asynchronously through twisted's adbapi connection pool
        def __init__(self, dbpool):
            self.dbpool = dbpool
    
        @classmethod
        def from_settings(cls, settings):
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                user=settings["MYSQL_USER"],
                password=settings["MYSQL_PASSWORD"],
                db=settings["MYSQL_DBNAME"],
                charset='utf8',
                cursorclass=MySQLdb.cursors.DictCursor,
                use_unicode=True
            )
    
            dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
            return cls(dbpool)
    
        def process_item(self, item, spider):
            # run the mysql insert asynchronously via twisted
            query = self.dbpool.runInteraction(self.do_insert, item)
            # handle insert errors
            query.addErrback(self.handle_error)
            return item

        def handle_error(self, failure):
            # handle exceptions raised by the asynchronous insert
            print(failure)

        def do_insert(self, cursor, item):
            # run the actual insert; the item builds its own sql and params
            # insert_sql = """
            #     insert into article(title, url, create_date, fav_nums, url_object_id, front_image_path, praise_nums, comment_nums, tags, content, front_image_url)
            #     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            # """
            # cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums'], item['url_object_id'], item['front_image_path'], item['praise_nums'], item['comment_nums'], item['tags'], item['content'], item['front_image_url']))
            insert_sql, params = item.get_insert_sql()
            cursor.execute(insert_sql, params)
    
    
    class JsonItemExporterPipeline(object):
        # use scrapy's built-in JsonItemExporter to write a json file
        def __init__(self):
            self.file = codecs.open('articleport.json', 'wb')
            self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
            self.exporter.start_exporting()
    
        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()
    
        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item
    
    
    class ArticleImagePipeline(ImagesPipeline):
        # custom image pipeline: record the local path of the downloaded cover image on the item
        def item_completed(self, results, item, info):
            if "front_image_url" in item:
                image_file_path = ""
                for ok, value in results:
                    if ok:
                        image_file_path = value["path"]
                item["front_image_path"] = image_file_path

            return item
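
    If you also want to know which item caused a failed insert, Twisted's addErrback accepts extra positional arguments that are passed through to the errback. A small variation of the MysqlTwistedPipeline methods above (not in the original post) could look like this:

        def process_item(self, item, spider):
            query = self.dbpool.runInteraction(self.do_insert, item)
            # pass the item and spider along so the errback can log useful context
            query.addErrback(self.handle_error, item, spider)
            return item

        def handle_error(self, failure, item, spider):
            # log the failure together with the offending url for easier debugging
            spider.logger.error("MySQL insert failed for %s: %s", item.get("url"), failure)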
    

    settings.py

    Add the following:

    MYSQL_HOST = "127.0.0.1"
    MYSQL_DBNAME = "articlespider"
    MYSQL_USER = "root"
    MYSQL_PASSWORD = "123456"
    ROBOTSTXT_OBEY = False
    import sys, os
    BASE_DIR = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
    sys.path.insert(0, os.path.join(BASE_DIR, "ArticleSpider"))
    
    ITEM_PIPELINES = {
        # 'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
        # "scrapy.pipelines.images.ImagesPipeline": 1,             # scrapy's built-in image download pipeline
        # 'ArticleSpider.pipelines.JsonWithEncodingPipeline': 2,   # custom export to a json file
        # 'ArticleSpider.pipelines.JsonItemExporterPipeline': 2,   # export to json with scrapy's built-in exporter
        # 'ArticleSpider.pipelines.MysqlPipeline': 2,              # synchronous mysql writes
        'ArticleSpider.pipelines.MysqlTwistedPipeline': 2,         # asynchronous mysql writes
        'ArticleSpider.pipelines.ArticleImagePipeline': 1,         # custom image download pipeline
    }
    # image download settings
    IMAGES_URLS_FIELD = "front_image_url"
    project_dir = os.path.abspath(os.path.dirname(__file__))
    IMAGES_STORE = os.path.join(project_dir, 'images')
    DOWNLOAD_FAIL_ON_DATALOSS = False
    
    

    main.py

    Launcher script

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    from scrapy.cmdline import execute

    import sys, os
    # make sure the project root (the directory containing scrapy.cfg) is on sys.path
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    execute(["scrapy", "crawl", "jobbole"])
    

    Scraped data

    (screenshot of the scraped results omitted)

  • Original post: https://www.cnblogs.com/guigujun/p/9932283.html