zoukankan      html  css  js  c++  java
  • python爬虫总结

    安装Scrapy(有很多依赖库要装,略麻烦)

    参考: https://www.cnblogs.com/liuliliuli2017/p/6746440.html

    Scrapy中文文档: http://scrapy-chs.readthedocs.io/zh_CN/0.24/index.html

    查看scrapy基本信息和功能

    scrapy

      测试爬虫性能

    scrapy bench 

    爬取网页信息(以百度首页为例)

    scrapy fetch "http://www.baidu.com"

    shell环境,可以在cmd进行操作(以百度为例)

    scrapy shell "http://www.baidu.com"
    print response.body # 打印响应主体

    创建项目(以ITcast为例)

    scrapy startproject ITcast

    settings.py屏蔽ROBOTSTXT_OBEY(不遵守机器人协议)

    生成爬虫文件

    # scrapy genspider example example_url
    scrapy genspider itcast "http://www.itcast.cn"

    items字段(items.py)

    import scrapy
    
    
    class ItcastItem(scrapy.Item):
        """Item holding the three text fields collected for one teacher entry."""
        # define the fields for your item here like:
        
        # Teacher's name
        name = scrapy.Field()
        # Teacher's job title
        title = scrapy.Field()
        # Teacher's descriptive info text
        info = scrapy.Field()

    编写爬虫文件(itcast.py)

    # -*- coding: utf-8 -*-
    
    import scrapy
    from ITcast.items import ItcastItem
    
    class ItcastSpider(scrapy.Spider):
        """Spider that collects teacher profiles from the ITcast teacher page.

        Every ``div`` with class ``li_txt`` on the start page becomes one
        ``ItcastItem`` carrying the teacher's name, title and info text.
        """

        # Spider name (required); this is what ``scrapy crawl itcast`` refers to.
        name = 'itcast'
        # Scrapy expects bare domain names here, not URLs (no scheme or path).
        allowed_domains = ['itcast.cn']
        start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

        def parse(self, response):
            """Build one item per teacher block found in ``response``.

            Args:
                response: The downloaded teacher-list page.

            Returns:
                list: One ``ItcastItem`` per ``div.li_txt`` node. A field whose
                element is missing from the node is stored as an empty string
                instead of aborting the whole page with ``IndexError``.
            """
            items = []
            for node in response.xpath("//div[@class='li_txt']"):
                item = ItcastItem()
                item['name'] = self._first_text(node, "./h3/text()")
                item['title'] = self._first_text(node, "./h4/text()")
                item['info'] = self._first_text(node, "./p/text()")
                items.append(item)
            return items

        @staticmethod
        def _first_text(node, query):
            """Return the first string matched by ``query`` under ``node``, or ''."""
            matches = node.xpath(query).extract()
            return matches[0] if matches else ''

    检查爬虫是否无误

    scrapy check itcast

    运行爬虫

    scrapy crawl itcast

    查看爬虫

    scrapy list

    编写多个管道,则需要在settings文件中的ITEM_PIPELINES添加

    例: 腾讯招聘(多页抓取)

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class TencentItem(scrapy.Item):
        """Item for one job posting; only the position name field is enabled."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        # Position name
        positionName = scrapy.Field()
        # Position detail link (disabled)
        #positionLink = scrapy.Field()
        # Position type (disabled)
        #positionType = scrapy.Field()
        # Number of openings (disabled)
        #peopleNumber = scrapy.Field()
        # Work location (disabled)
        #workLocation = scrapy.Field()
        # Publish time (disabled)
        #publishTime = scrapy.Field()
        #pass
    View Code

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json
    class TencentPipeline(object):
        """Write every scraped item to ``tencent.json``, one JSON object per line."""

        def __init__(self):
            # Explicit UTF-8: items are dumped with ensure_ascii=False, so the
            # platform's default encoding could fail on non-ASCII text.
            self.f = open("tencent.json", "w", encoding="utf-8")

        def process_item(self, item, spider):
            """Append ``item`` to the output file as a JSON line.

            Args:
                item: The scraped item; anything ``dict()`` can convert.
                spider: The spider that produced the item (unused).

            Returns:
                The same ``item``, unchanged.
            """
            content = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.f.write(content)
            return item

        def close_spider(self, spider):
            """Close the output file."""
            self.f.close()
    View Code

    settings.py开启管道

    # Enable TencentPipeline for this project. The integer is the pipeline's
    # order value (NOTE(review): per Scrapy docs, lower values run first).
    ITEM_PIPELINES = {
        'Tencent.pipelines.TencentPipeline': 300,
    }
    View Code

    tencent.py

    # -*- coding: utf-8 -*-
    import scrapy
    from Tencent.items import TencentItem
    
    class TencentSpider(scrapy.Spider):
        """Spider that walks the Tencent job listing page by page.

        Starts at ``base_url`` with ``offset`` 0 and keeps following the page's
        "next" link, yielding one ``TencentItem`` per listing row.
        """

        name = 'tencent'
        allowed_domains = ['tencent.com']
        base_url = "http://hr.tencent.com/position.php?&start="
        offset = 0

        start_urls = [base_url + str(offset)]

        def parse(self, response):
            """Yield an item per job row, then a request for the next page.

            Args:
                response: One downloaded listing page.

            Yields:
                ``TencentItem`` objects for each ``tr.even`` / ``tr.odd`` row that
                has a position link, followed by a ``scrapy.Request`` for the
                next page when one is available.
            """
            rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
            for node in rows:
                names = node.xpath("./td[1]/a/text()").extract()
                if not names:
                    # Row without a position link: nothing to record.
                    continue
                item = TencentItem()
                item['positionName'] = names[0]
                # Columns td[2]..td[5] can be extracted the same way once the
                # matching fields are enabled on TencentItem.
                yield item

            # Follow the element with id="next". An href starting with "java"
            # is skipped (presumably a javascript: placeholder on the last page).
            next_links = response.xpath("//*[@id='next']/@href").extract()
            if next_links and not next_links[0].startswith("java"):
                # urljoin resolves the href against the current page URL rather
                # than relying on a hard-coded host prefix.
                yield scrapy.Request(response.urljoin(next_links[0]),
                                     callback=self.parse)
    View Code

    例: 斗鱼主播图片爬取(图片爬取)

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DouyuItem(scrapy.Item):
        """Item describing one streamer: a nickname plus an image URL."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        # Streamer nickname
        nickname = scrapy.Field()
        # URL of the streamer's image to download
        imagelink = scrapy.Field()
        
    View Code

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import os
    from Douyu.settings import IMAGES_STORE as image_store
    from scrapy.pipelines.images import ImagesPipeline
    import scrapy
    class DouyuPipeline(ImagesPipeline):
        """Download each item's image and rename the file after the nickname."""

        def get_media_requests(self, item, info):
            """Yield the download request for the item's image URL."""
            yield scrapy.Request(item['imagelink'])

        def item_completed(self, results, item, info):
            """Rename the downloaded file to ``<nickname>.jpg`` and return the item.

            Args:
                results: ``(ok, detail)`` pairs, one per request issued by
                    ``get_media_requests``; ``detail['path']`` is the stored
                    file path relative to ``IMAGES_STORE`` when ``ok`` is true.
                item: The item whose image was requested.
                info: Pipeline bookkeeping object (unused).

            Returns:
                The item. Returning it is required so that later pipelines and
                exporters receive the item instead of ``None``.
            """
            image_paths = [detail['path'] for ok, detail in results if ok]
            if image_paths:
                # Only rename when a download actually succeeded; a failed
                # download leaves nothing on disk to move.
                os.rename(
                    os.path.join(image_store, image_paths[0]),
                    os.path.join(image_store, item['nickname'] + ".jpg"),
                )
            return item
    View Code

    settings.py配置IMAGES_STORE和USER_AGENT并开启管道(同腾讯招聘)

    # Directory for downloaded images; pipelines.py imports this value to build
    # the file paths it renames.
    IMAGES_STORE = "E:/PythonScrapy/Douyu/Douyu/Images/"
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # A mobile-browser user-agent string is configured here.
    USER_AGENT = 'Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT7-TL00 Build/HuaweiMT7-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.3.8.909 Mobile Safari/537.36'
    View Code

    douyu.py

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    from Douyu.items import DouyuItem
    class DouyuSpider(scrapy.Spider):
        """Spider that reads streamer records from the Douyu vertical-room API.

        The start URL requests 20 records at ``offset`` 0; each record yields a
        ``DouyuItem`` with the streamer's nickname and image URL.
        """

        name = 'douyu'
        allowed_domains = ['douyucdn.cn']
        baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
        offset = 0
        start_urls = [baseURL + str(offset)]

        def parse(self, response):
            """Yield one ``DouyuItem`` per record in the JSON response.

            Args:
                response: API response whose body is a JSON object with a
                    ``data`` list of records.

            Yields:
                ``DouyuItem`` objects built from each record's ``nickname`` and
                ``vertical_src`` values. Nothing is yielded when ``data`` is
                missing or empty.
            """
            # NOTE(review): the body is decoded as GBK, as in the original;
            # confirm the API's real encoding (JSON is normally UTF-8).
            payload = json.loads(response.body.decode('gbk'))
            data_list = payload.get('data')
            if not data_list:
                return

            for data in data_list:
                item = DouyuItem()
                item['nickname'] = data['nickname']
                item['imagelink'] = data['vertical_src']
                yield item

            # Pagination is intentionally left disabled; uncomment to keep
            # requesting further pages until an empty ``data`` list comes back.
            #self.offset += 20
            #yield scrapy.Request(self.baseURL + str(self.offset), callback = self.parse)
    View Code
  • 相关阅读:
    “访问”美术馆
    加分二叉树
    有线电视网
    二叉苹果树
    鬼子进村
    遍历问题
    最大子树和
    FBI树
    求前序遍历
    JS如何实现点击页面内任意的链接均加参数跳转?
  • 原文地址:https://www.cnblogs.com/wust-ouyangli/p/8457577.html
Copyright © 2011-2022 走看看