zoukankan      html  css  js  c++  java
  • pyspider解析

    https://www.cnblogs.com/microman/p/6111711.html

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2017-12-07 13:40:43
    # Project: adquan
    
    from pyspider.libs.base_handler import *
    
    
    class Handler(BaseHandler):
        """pyspider handler that crawls an adquan page and stores its article images.

        Images found inside ``.con_Text`` on a detail page are fetched through
        the crawler itself and written to disk by the ``Deal`` helper.
        """

        crawl_config = {}

        def __init__(self):
            # File-system helper used for directory creation and image writes.
            self.deal = Deal()

        @every(minutes=24 * 60)
        def on_start(self):
            """Seed the crawl with a single, fixed detail page."""
            self.crawl(
                'http://creative.adquan.com/show/42759',
                callback=self.detail_page,
            )

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every link whose ``href`` starts with "http" as a detail page."""
            for anchor in response.doc('a[href^="http"]').items():
                self.crawl(anchor.attr.href, callback=self.detail_page)

        @config(priority=2)
        def detail_page(self, response):
            """Schedule a download for each article image and return page metadata.

            Images are numbered in the order they are encountered; ``<img>``
            tags without a ``src`` are skipped and do not consume a number.
            The target directory and file name travel with the request in
            ``save`` so that ``save_img`` knows where to write.
            """
            name = 'test'
            index = 0
            for img in response.doc('.con_Text img').items():
                src = img.attr.src
                if not src:
                    continue
                target_dir = self.deal.mkDir(name)
                suffix = self.deal.getExtension(src)
                target_name = '.'.join([str(index), suffix])
                index += 1
                self.crawl(
                    src,
                    callback=self.save_img,
                    save={'dir_path': target_dir, 'file_name': target_name},
                )
            return {
                "url": response.url,
                "title": response.doc('title').text(),
            }

        def save_img(self, response):
            """Write a fetched image to the location recorded in ``response.save``."""
            payload = response.content
            meta = response.save
            destination = '/'.join([meta['dir_path'], meta['file_name']])
            self.deal.saveImg(payload, destination)
    
        
    import os
    
    # Root directory for everything the crawler writes to disk. Deal.__init__
    # copies it into self.path, appends a trailing '/' if one is missing and
    # creates the directory when it does not exist yet.
    DIR_PATH = "E:/pyspider/"
    
    class Deal:
        """File-system helper for the crawler: creates folders and writes files.

        All directories handed out by ``mkDir`` live under ``self.path``, which
        is initialised from the module-level ``DIR_PATH`` and always ends with
        a forward slash.
        """

        def __init__(self):
            self.path = DIR_PATH
            if not self.path.endswith('/'):
                self.path = self.path + '/'
            self._ensure_dir(self.path)

        def _ensure_dir(self, dir_path):
            """Create ``dir_path`` (and parents) unless it already is a directory."""
            try:
                os.makedirs(dir_path)
            except OSError:
                # Creating first and checking afterwards avoids the race between
                # an existence test and the creation; any error other than
                # "the directory is already there" is re-raised.
                if not os.path.isdir(dir_path):
                    raise

        def mkDir(self, path):
            """Ensure the sub-directory ``path`` exists under the root.

            Surrounding whitespace in ``path`` is stripped.

            Returns:
                The full directory path (root + stripped ``path``).
            """
            dir_path = self.path + path.strip()
            self._ensure_dir(dir_path)
            return dir_path

        def saveImg(self, content, path):
            """Write the binary ``content`` to the file at ``path``."""
            # The context manager closes the handle even when write() fails.
            with open(path, 'wb') as f:
                f.write(content)

        def saveBrief(self, content, dir_path, name):
            """Write ``content`` as UTF-8 text to ``<dir_path>/<name>.txt``.

            ``content`` may be text or UTF-8 encoded bytes.
            """
            import io

            file_name = dir_path + "/" + name + ".txt"
            # Normalise to text so one explicit-encoding write works on both
            # Python 2 and 3; writing encoded bytes to a text-mode file raises
            # TypeError on Python 3.
            if isinstance(content, bytes):
                content = content.decode('utf-8')
            with io.open(file_name, "w", encoding="utf-8") as f:
                f.write(content)

        def getExtension(self, url):
            """Return the file extension of ``url`` without the leading dot.

            Query string and fragment are ignored, and only the last path
            segment is inspected, so ``.../pic.jpg?w=100`` yields ``'jpg'``.
            An empty string is returned when that segment contains no dot.
            """
            path = url.split('#', 1)[0].split('?', 1)[0]
            last_segment = path.rsplit('/', 1)[-1]
            if '.' not in last_segment:
                return ''
            return last_segment.rsplit('.', 1)[-1]
    

      http://demo.pyspider.org/

  • 相关阅读:
    There is an overlap in the region chain修复
    There is an overlap in the region chain
    region xx not deployed on any region server
    python 中的re模块,正则表达式
    TCP粘包问题解析与解决
    yield from
    Git push提交时报错Permission denied(publickey)...Please make sure you have the correct access rights and the repository exists.
    mysql 中Varchar 与char的区别
    Mysql 字符集及排序规则
    请实现一个装饰器,限制该函数被调用的频率,如10秒一次
  • 原文地址:https://www.cnblogs.com/jiangjing/p/8001321.html
Copyright © 2011-2022 走看看