zoukankan      html  css  js  c++  java
  • pyspider解析

    https://www.cnblogs.com/microman/p/6111711.html

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2017-12-07 13:40:43
    # Project: adquan
    
    from pyspider.libs.base_handler import *
    
    
    class Handler(BaseHandler):
        """pyspider handler that crawls a page and stores its images via ``Deal``."""

        crawl_config = {
        }

        def __init__(self):
            # Filesystem helper shared by the callbacks below.
            self.deal = Deal()

        @every(minutes=24 * 60)
        def on_start(self):
            """Seed the crawl with a single page handled by ``detail_page``."""
            start_url = 'http://creative.adquan.com/show/42759'
            self.crawl(start_url, callback=self.detail_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue every link whose href starts with ``http`` for ``detail_page``."""
            anchors = response.doc('a[href^="http"]').items()
            for anchor in anchors:
                self.crawl(anchor.attr.href, callback=self.detail_page)

        @config(priority=2)
        def detail_page(self, response):
            """Schedule a download for each image inside ``.con_Text``.

            Images are numbered in the order they are found (only images that
            actually have a ``src`` consume a number). Returns a dict with the
            page URL and title.
            """
            folder = 'test'
            index = 0
            for img in response.doc('.con_Text img').items():
                src = img.attr.src
                if not src:
                    # Skip <img> tags without a usable source.
                    continue
                target_dir = self.deal.mkDir(folder)
                file_name = str(index) + '.' + self.deal.getExtension(src)
                index += 1
                # Pass the destination along so save_img knows where to write.
                destination = {'dir_path': target_dir, 'file_name': file_name}
                self.crawl(src, callback=self.save_img, save=destination)

            result = {
                "url": response.url,
                "title": response.doc('title').text(),
            }
            return result

        def save_img(self, response):
            """Write the fetched image bytes to the location given in ``response.save``."""
            data = response.content
            destination = response.save
            file_path = destination['dir_path'] + '/' + destination['file_name']
            self.deal.saveImg(data, file_path)
    
        
    import os
    
    # Root directory for saved files; Deal() appends a trailing '/' if needed
    # and creates the directory when it does not exist.
    DIR_PATH = "E:/pyspider/"
    
    class Deal:
        """Small filesystem helper for storing downloaded files.

        Directories are created beneath ``self.path``, which always ends with
        a trailing ``/``.
        """

        def __init__(self, path=None):
            """Set up the root directory, creating it if it does not exist.

            :param path: root directory; the module-level ``DIR_PATH`` is used
                when this is omitted.
            """
            self.path = DIR_PATH if path is None else path
            if not self.path.endswith('/'):
                self.path = self.path + '/'
            self._ensureDir(self.path)

        def _ensureDir(self, dir_path):
            """Create ``dir_path`` (and parents) unless it is already a directory."""
            # Attempt the creation and inspect the failure instead of checking
            # first: this avoids the race between an exists() test and
            # makedirs(), and works on both Python 2 and Python 3.
            try:
                os.makedirs(dir_path)
            except OSError:
                if not os.path.isdir(dir_path):
                    raise

        def mkDir(self, path):
            """Ensure a sub-directory of the root exists and return its path.

            :param path: directory name relative to the root; surrounding
                whitespace is stripped.
            :return: the root path joined with the stripped name.
            :raises OSError: if the directory cannot be created.
            """
            dir_path = self.path + path.strip()
            self._ensureDir(dir_path)
            return dir_path

        def saveImg(self, content, path):
            """Write the raw bytes ``content`` to the file at ``path``."""
            # The context manager closes the handle even if write() fails.
            with open(path, 'wb') as f:
                f.write(content)

        def saveBrief(self, content, dir_path, name):
            """Write ``content`` as UTF-8 to ``<dir_path>/<name>.txt``.

            :param content: text to store.
            :param dir_path: directory that will contain the file.
            :param name: file name without the ``.txt`` suffix.
            """
            file_name = dir_path + "/" + name + ".txt"
            # Binary mode: the encoded bytes are written as-is, which is valid
            # on both Python 2 and Python 3 (a text-mode handle rejects bytes
            # on Python 3). The handle is closed on exit.
            with open(file_name, "wb") as f:
                f.write(content.encode('utf-8'))

        def getExtension(self, url):
            """Return the file extension of ``url`` without the leading dot.

            Any query string or fragment is ignored and only the last path
            segment is inspected, so ``.../a.jpg?x=1`` yields ``jpg``. An empty
            string is returned when the last segment has no dot.
            """
            without_fragment = url.split('#', 1)[0]
            without_query = without_fragment.split('?', 1)[0]
            last_segment = without_query.rsplit('/', 1)[-1]
            if '.' not in last_segment:
                return ''
            return last_segment.rsplit('.', 1)[-1]
    

      http://demo.pyspider.org/

  • 相关阅读:
    Webbrowser中模拟连接点击(非鼠标模拟)
    用DDE控制Word
    禁止用键盘左右箭头,去切换PageControl页签
    Delphi实现全局鼠标钩子
    Delphi实现软件中登录用户的操作权限
    根据数据库结构生成TreeView
    根据字符串找到函数并执行
    用DLL实现插件的简单演示
    Delphi:窗体的扩展样式GWL_EXSTYLE用于SetWindowLong
    FastReport问题整理(http://129.sqdj.gov.cn/?p=77)
  • 原文地址:https://www.cnblogs.com/jiangjing/p/8001321.html
Copyright © 2011-2022 走看看