zoukankan      html  css  js  c++  java
  • scrapy-jobbole伯乐案例

    settings.py 配置项目管道

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
        # 300 is the pipeline priority: the smaller the number, the higher the priority.
        'bole.pipelines.BolePipeline': 300,
    }

    jobbole.py

    import scrapy
    from bole.items import BoleItem
    
    class JobboleSpider(scrapy.Spider):
        """Crawl the jobbole.com finance news section (gsyw) and yield one
        BoleItem (title, p_time, article_url) per article detail page."""
        name = 'jobbole'
        allowed_domains = ['jobbole.com']

        def start_requests(self):
            """Generate requests for listing pages 1..32."""
            base_url = 'http://www.jobbole.com/caijing/gsyw/index_{}.html'
            for page in range(1, 33):
                yield scrapy.Request(url=base_url.format(page), callback=self.parse)

        def parse(self, response):
            """Extract article links from a listing page and follow each one."""
            href_list = response.xpath('//div[@class="list-item"]/div[@class="img"]/a/@href').extract()
            for href in href_list:
                # Rebuild an absolute URL from the last path segment of the href.
                detail_url = "http://www.jobbole.com/caijing/gsyw/" + href.split('/')[-1]
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)

        def parse_detail(self, response):
            """Parse one article detail page into a BoleItem.

            Fields missing from the page are stored as None instead of
            crashing the spider.
            """
            title = response.xpath('//div[@class="article-head"]/h1/text()').extract_first()
            # extract_first() returns None when the node is absent — guard
            # before calling split(), otherwise this raises AttributeError.
            raw_date = response.xpath('//div[@class="about"]/div[@class="date"]/span[1]/text()').extract_first()
            p_time = raw_date.split(' ')[0] if raw_date else None
            item = BoleItem()
            item['title'] = title
            item['p_time'] = p_time
            item['article_url'] = response.url
            yield item

    items.py

    import scrapy
    
    
    class BoleItem(scrapy.Item):
        """Container for one scraped article: headline, publish date, and URL."""
        # Article headline text from the detail page.
        title = scrapy.Field()
        # Date portion of the page's publish-date text (text before the first space).
        p_time = scrapy.Field()
        # Absolute URL of the article detail page.
        article_url = scrapy.Field()

    bole_mysql.py

    """
    CREATE TABLE bole_data(
        id int primary key auto_increment,
        title varchar(100),
        p_time date,
        article_url varchar(100)) default charset=utf8mb4;
    """
    import pymysql
    
    
    class BoleMysql(object):
        """Thin pymysql wrapper: opens a connection on creation and runs
        parameterized statements with an immediate commit.

        Connection parameters default to the original hard-coded values but
        may now be overridden, e.g. BoleMysql(host='db', passwd='...').
        """

        def __init__(self, host='127.0.0.1', user='root', passwd='510520',
                     db='pachong', charset='utf8mb4'):
            # Connect immediately; one instance owns one connection.
            self.conn = pymysql.connect(host=host, user=user, passwd=passwd,
                                        db=db, charset=charset)
            self.cursor = self.conn.cursor()

        def execute_insert_sql(self, sql, bole_data):
            """Execute a parameterized statement and commit.

            sql       -- SQL text with %s placeholders
            bole_data -- sequence of values bound to the placeholders
            """
            self.cursor.execute(sql, bole_data)
            self.conn.commit()

        def __del__(self):
            # If __init__ raised before setting the attributes there is
            # nothing to close; also swallow errors during interpreter
            # teardown, where module globals may already be gone.
            try:
                self.cursor.close()
                self.conn.close()
            except Exception:
                pass
    
    
    if __name__ == '__main__':
        # Manual smoke test: insert a single sample row into bole_data.
        client = BoleMysql()
        sql = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"
        row = ('花好月圆夜', '2020-12-18', 'https://www.baidu.com')
        client.execute_insert_sql(sql, row)

    pipelines.py

    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from project_01.shujuku.bole_mysql import BoleMysql
    
    
    class BolePipeline:
        """Persist every scraped item into the bole_data MySQL table."""

        def __init__(self):
            # One DB connection per pipeline instance, held for the whole crawl.
            self.bole_mysql = BoleMysql()

        def process_item(self, item, spider):
            """Insert the item's fields into MySQL and pass the item on.

            Uses the (already imported) ItemAdapter so the pipeline handles
            dicts and dataclass items as well as scrapy.Item instances.
            """
            adapter = ItemAdapter(item)
            insert_sql = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"
            data = (adapter['title'], adapter['p_time'], adapter['article_url'])
            self.bole_mysql.execute_insert_sql(insert_sql, data)
            return item

    run_jobbole.py

    from scrapy.cmdline import execute

    # Launch the spider exactly as `scrapy crawl jobbole` would from the shell.
    execute(['scrapy', 'crawl', 'jobbole'])
  • 相关阅读:
    Win10 UWP Tile Generator
    Win10 BackgroundTask
    UWP Tiles
    UWP Ad
    Win10 build package error collections
    Win10 八步打通 Nuget 发布打包
    Win10 UI入门 pivot multiable DataTemplate
    Win10 UI入门 导航滑动条 求UWP工作
    UWP Control Toolkit Collections 求UWP工作
    Win10 UI入门 SliderRectangle
  • 原文地址:https://www.cnblogs.com/glz666/p/14190065.html
Copyright © 2011-2022 走看看