Scraping jobbole Data with Scrapy

    1. Python version: 3.6.1

    2. Python editor: JetBrains PyCharm

    3. Install virtualenvwrapper-win

    pip3 install virtualenvwrapper-win

    4. Create the virtual environment and install dependencies

    mkvirtualenv spider_article
    pip install C:\Users\CR\Downloads\Twisted-17.5.0-cp36-cp36m-win_amd64.whl
    pip install pypiwin32
    pip install -i https://pypi.douban.com/simple/ scrapy
    pip install mysqlclient
    pip install pillow
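
    On Windows, pip frequently fails to compile Twisted from source, which is why a prebuilt wheel is installed first. A quick sanity check that the key packages import (a minimal snippet, not part of the original; run it inside the virtualenv):

    # verify that scrapy, mysqlclient and pillow are importable
    import scrapy
    import MySQLdb
    import PIL

    print("Scrapy", scrapy.__version__)  # your version may differ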

    5. Create the Scrapy project (in the directory where the project should live):

      1. Open cmd

      2. workon spider_article

      3. scrapy startproject ArticleSpider

      4. cd ArticleSpider

      5. scrapy genspider jobbole blog.jobbole.com
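
    After these commands the directory layout should look roughly like this (standard Scrapy scaffolding; minor differences between Scrapy versions are possible):

    ArticleSpider/
        scrapy.cfg
        ArticleSpider/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                jobbole.py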

    6. Create a debug entry file under the ArticleSpider folder

    from scrapy.cmdline import execute

    import sys
    import os

    # make sure the project root is on sys.path so the scrapy command can find it
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    # equivalent to running "scrapy crawl jobbole" on the command line
    execute(["scrapy", "crawl", "jobbole"])
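
    The file name is not given above; main.py in the project root, next to scrapy.cfg, is a common choice. Running or debugging this file directly in PyCharm means breakpoints set inside jobbole.py are hit, which is the point of creating it.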

    7. Main code

    1. Contents of jobbole.py

    import scrapy
    import re
    import datetime
    from scrapy.http import Request
    from urllib import parse   # in Python 2: import urlparse instead
    from scrapy.loader import ItemLoader


    from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader

    from ArticleSpider.utils.common import get_md5


    class JobboleSpider(scrapy.Spider):
        name = 'jobbole'
        allowed_domains = ['blog.jobbole.com']
        start_urls = ['http://blog.jobbole.com/all-posts/']

        def parse(self, response):

            '''
            1. Extract the article URLs from the list page and hand them to scrapy to download and parse.
            2. Extract the next-page URL and hand it to scrapy to download; the response comes back to parse.
            :param response:
            :return:
            '''
            # extract() returns a list as soon as it is called
            post_nodes = response.css("#archive .floated-thumb .post-thumb a")
            for post_node in post_nodes:
                image_url = post_node.css("img::attr(src)").extract_first("")
                # URL of the current article
                post_url = post_node.css("::attr(href)").extract_first("")
                # parse.urljoin: if post_url has no domain, the domain is taken from response.url;
                # if post_url already has a domain, response.url has no effect
                yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url}, callback=self.parse_detail)

            # extract the next page and hand it to scrapy to download
            # (the original yielded post_url here, which re-queued the last article instead of the next page)
            next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
            if next_url:
                yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

        def parse_detail(self, response):
            '''
            Extract the concrete fields of an article.
            :param response:
            :return:
            '''
            front_image_url = response.meta.get("front_image_url", "")  # article cover image
            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
            item_loader.add_css("title", ".entry-header h1::text")
            item_loader.add_value("url", response.url)
            item_loader.add_value("front_image_url", [front_image_url])
            item_loader.add_value("url_object_id", get_md5(response.url))
            item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
            item_loader.add_css("praise_nums", ".vote-post-up h10::text")
            item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
            item_loader.add_css("fav_nums", ".bookmark-btn::text")
            item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
            item_loader.add_css("content", "div.entry")
            article_item = item_loader.load_item()
            yield article_item
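
    The behaviour of parse.urljoin described in the comment above can be verified on its own (a standalone snippet; the URLs are illustrative):

    from urllib import parse

    # relative path: the domain comes from the first argument
    print(parse.urljoin("http://blog.jobbole.com/all-posts/", "/110287/"))
    # -> http://blog.jobbole.com/110287/

    # absolute URL: the first argument is ignored
    print(parse.urljoin("http://blog.jobbole.com/all-posts/", "http://example.com/1/"))
    # -> http://example.com/1/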

    2. items.py

    import datetime
    import re

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join


    class ArticlespiderItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        pass

    def date_convert(value):
        try:
            create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
        except Exception as e:
            create_date = datetime.datetime.now().date()
        return create_date


    def get_nums(value):
        match_re = re.match(r".*?(\d+).*", value)
        if match_re:
            nums = int(match_re.group(1))
        else:
            nums = 0
        return nums

    def remove_comment_tags(value):
        # drop the "comments" entry that gets extracted along with the tags
        if "评论" in value:
            return ""
        else:
            return value

    def return_value(value):
        # identity function: pass the value through unchanged
        return value

    class ArticleItemLoader(ItemLoader):
        # custom ItemLoader
        default_output_processor = TakeFirst()


    class JobBoleArticleItem(scrapy.Item):
        title = scrapy.Field()
        # title = scrapy.Field(
        #     input_processor = MapCompose(lambda x: x + '-jobbole')
        # )
        create_date = scrapy.Field(
            input_processor=MapCompose(date_convert),
            # output_processor = TakeFirst(),  # take only the first value
        )
        url = scrapy.Field()
        url_object_id = scrapy.Field()
        front_image_url = scrapy.Field()
        front_image_path = scrapy.Field(
            input_processor=MapCompose(return_value)
        )
        praise_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        comment_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        fav_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        tags = scrapy.Field(
            input_processor=MapCompose(remove_comment_tags),
            output_processor=Join(",")
        )
        content = scrapy.Field()
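
    The three processors can be tried outside of a spider; the calls below use the same classes imported at the top of items.py (the values are made up for illustration):

    from scrapy.loader.processors import MapCompose, TakeFirst, Join

    # MapCompose runs every value through each function in turn
    print(MapCompose(str.strip, str.upper)([" python ", " scrapy "]))  # ['PYTHON', 'SCRAPY']

    # TakeFirst returns the first non-empty value -- why it works as default_output_processor
    print(TakeFirst()(["", "8 评论"]))  # 8 评论

    # Join concatenates the values, as used for the tags field
    print(Join(",")(["python", "爬虫"]))  # python,爬虫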

    3. pipelines.py

    import codecs
    import json

    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exporters import JsonItemExporter
    from twisted.enterprise import adbapi  # adbapi turns blocking MySQLdb operations into asynchronous ones

    import MySQLdb
    import MySQLdb.cursors

    class ArticlespiderPipeline(object):
        def process_item(self, item, spider):
            return item

    class JsonWithEncodingPipeline(object):
        # custom JSON file export
        def __init__(self):
            self.file = codecs.open('article.json', 'w', encoding="utf-8")

        def process_item(self, item, spider):
            lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
            self.file.write(lines)
            return item

        def spider_closed(self, spider):
            self.file.close()

    class MysqlPipeline(object):
        def __init__(self):
            self.conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8", use_unicode=True)
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            insert_sql = '''
                INSERT INTO article (title, url, create_date, fav_nums)
                VALUES (%s, %s, %s, %s)
            '''
            self.cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))
            self.conn.commit()
            return item

    class MysqlTwistedPipeline(object):
        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                user=settings["MYSQL_USER"],
                passwd=settings["MYSQL_PASSWORD"],
                charset='utf8',
                cursorclass=MySQLdb.cursors.DictCursor,
                use_unicode=True,
            )
            dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

            return cls(dbpool)

        def process_item(self, item, spider):
            '''
            Use Twisted to run the MySQL insert asynchronously.
            :param item:
            :param spider:
            :return:
            '''
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error, item, spider)  # handle exceptions
            return item

        def handle_error(self, failure, item, spider):
            # handle exceptions raised by the asynchronous insert
            print(failure)

        def do_insert(self, cursor, item):
            # perform the actual insert
            insert_sql = '''
                INSERT INTO article (title, url, create_date, fav_nums)
                VALUES (%s, %s, %s, %s)
            '''
            cursor.execute(insert_sql, (item['title'], item['url'], item['create_date'], item['fav_nums']))


    class JsonExporterPipeline(object):
        # use the JSON exporter provided by scrapy to export a JSON file
        def __init__(self):
            self.file = open('articleexport.json', 'wb')
            self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            self.exporter.finish_exporting()
            self.file.close()


        def process_item(self, item, spider):
            self.exporter.export_item(item)
            return item


    class ArticleImagePipeline(ImagesPipeline):
        def item_completed(self, results, item, info):
            # the original checked for "front_image_path", which is only set below and
            # would never be present yet; checking the source field is what was intended
            if "front_image_url" in item:
                for ok, value in results:
                    image_file_path = value["path"]
                item["front_image_path"] = image_file_path

            return item
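
    Both MySQL pipelines assume an article table already exists in the article_spider database. The original does not show the schema; a minimal sketch matching the INSERT statements (column types and lengths are assumptions, adjust as needed):

    import MySQLdb

    # assumed minimal schema -- columns match the INSERT statements above
    DDL = '''
    CREATE TABLE IF NOT EXISTS article (
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums INT DEFAULT 0
    ) DEFAULT CHARSET=utf8
    '''

    conn = MySQLdb.connect("localhost", "root", "", "article_spider", charset="utf8")
    conn.cursor().execute(DDL)
    conn.commit()
    conn.close()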

    4. Create shared helper functions (location: ArticleSpider/utils/common.py)

    import hashlib

    def get_md5(url):
        # hashlib works on bytes, so encode str input first
        if isinstance(url, str):
            url = url.encode("utf-8")
        m = hashlib.md5()
        m.update(url)
        return m.hexdigest()

    if __name__ == '__main__':
        print(get_md5("http://jobbole.com"))

    5. Configure the settings file

    import os

    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,  # image download
        'ArticleSpider.pipelines.MysqlTwistedPipeline': 3,
    }
    IMAGES_URLS_FIELD = 'front_image_url'
    project_dir = os.path.abspath(os.path.dirname(__file__))
    IMAGES_STORE = os.path.join(project_dir, "images")  # where downloaded images are stored


    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = 'article_spider'
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = ''
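
    Note that ITEM_PIPELINES above enables the stock ImagesPipeline, so the ArticleImagePipeline subclass defined in pipelines.py never runs and front_image_path is never filled in. If you want the saved file path written back onto the item, a variant (a suggestion, not part of the original settings) would be:

    ITEM_PIPELINES = {
        'ArticleSpider.pipelines.ArticleImagePipeline': 1,  # subclass that stores front_image_path on the item
        'ArticleSpider.pipelines.MysqlTwistedPipeline': 3,
    }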