zoukankan html css js c++ java

pyspider示例代码七：自动登录并获得PDF文件下载地址

自动登录并获得PDF文件下载地址

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-22 22:06:55
# Project: pdf_spider

import re
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """Crawl club.topsage.com and report downloadable attachment links.

    The crawl starts at the site root, follows forum listing pages and
    thread pages, and returns the first attachment-like link found on each
    thread page.  Every request is sent with the headers and cookies
    configured in ``crawl_config``.
    """

    # ``Cookie`` is deliberately kept as a module-level global (as in the
    # original script) so the name stays available at module scope.
    global Cookie
    # NOTE(security): these look like session cookies copied from a logged-in
    # browser (see the ``*_auth`` entry).  Hard-coding them in source leaks the
    # session and they will stop working once it expires -- consider loading
    # them from configuration instead.
    Cookie = {
        "tsclub_bb90_saltkey": "xozcC32l",
        "tsclub_bb90_lastvisit": "1428457605",
        "tsclub_bb90_visitedfid": "326",
        "tsclub_bb90_ulastactivity": "1428579196%7C0",
        "tsclub_bb90_auth": "f9f8KcrDaj3q9aY9OxESFgE2Cz%2BArVk0gZ5jv%2BQohyhctLjeopEZrXU%2FEbsF6pk%2B754%2Fsi5DnB0W%2BmsmLwMvtC3xkWLt",
        "tsclub_bb90_lastcheckfeed": "5470207%7C1428579196",
        "tsclub_bb90_lip": "122.13.84.73%2C1428579196",
        "tsclub_bb90_nofavfid": "1",
        "pgv_pvi": "8694210858",
        "pgv_info": "ssi=s5025153920",
        "Hm_lvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428461128,1428578830",
        "Hm_lpvt_ee0d63d2db0dfbf9e0d399bccbd5fce7": "1428581442",
        "tsclub_bb90_lastact": "1428581519%09misc.php%09patch",
        "tjpctrl": "1428583242081",
    }

    # Browser-like request headers sent with every fetch.
    # NOTE(review): the Accept-Encoding / Accept-Language values use ';' where
    # HTTP list syntax expects ','; left unchanged here -- confirm the intent.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip;deflate;sdch",
        "Accept-Language": "zh-CN,zh;en-US;q=0.8",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.4368.102 Safari/537.36",
        "Host": "club.topsage.com",
        "Pragma": "no-cache",
        # The standard header name is "Referer"; the original "Refer" is not
        # a recognised HTTP header.
        "Referer": "http://club.topsage.com",
        "Connection": "keep-alive",
    }

    crawl_config = {
        "headers": headers,
        "timeout": 1000,
        "cookies": Cookie,
    }

    # URL patterns, compiled once.  Dots and '?' are escaped so they match
    # literally: unescaped, '.' matches any character and 'php?' means
    # "ph" followed by an optional "p", which never matches a real
    # "forum.php?mod=attachment..." URL.
    _FORUM_URL = re.compile(r"http://club\.topsage\.com/forum-.+\.html", re.U)
    _THREAD_URL = re.compile(r"http://club\.topsage\.com/thread-.+\.html", re.U)
    # "excel" is kept for backward compatibility; xls/xlsx are the actual
    # spreadsheet file extensions.
    _FILE_URL = re.compile(
        r"^(?:http|ftp|https)://.+\."
        r"(?:zip|rar|tar|pdf|doc|docx|xls|xlsx|excel|ppt|pptx)$",
        re.U,
    )
    _ATTACHMENT_URL = re.compile(
        r"^http://club\.topsage\.com/forum\.php\?mod=attachment.+", re.U
    )

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the site root as the entry point of the crawl."""
        self.crawl('http://club.topsage.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every forum and thread link found on the front page."""
        for each in response.doc('a[href^="http"]').items():
            self._crawl_forum_or_thread(each.attr.href)

    @config(age=10 * 24 * 60 * 60, priority=2)
    def forum_page(self, response):
        """Queue thread links, sub-forum links and pager links of a forum page."""
        for each in response.doc('a[href^="http://club.topsage.com"]').items():
            self._crawl_forum_or_thread(each.attr.href)

        # next page
        for each in response.doc('html > body > div > div > div > div > a').items():
            href = each.attr.href
            # Anchors without an href yield None, which self.crawl cannot take.
            if href:
                self.crawl(href, callback=self.forum_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return the first attachment link found on a thread page.

        Returns a dict with ``download_url`` and ``file_name`` for the first
        matching link inside a table cell, or ``None`` when no link matches.
        """
        print('detail_page >> response url is ' + response.url)

        for each in response.doc('table tr > td > a').items():
            href = each.attr.href
            if self.is_url_matched(href):
                print('attachment url is ' + href)
                return {
                    "download_url": href,
                    "file_name": each.text(),
                }

    def is_url_matched(self, url):
        """Tell whether ``url`` looks like a downloadable attachment.

        A URL matches when it is an http/https/ftp URL ending in one of the
        known document/archive extensions, or when it is a
        ``forum.php?mod=attachment`` link on club.topsage.com.  A missing
        (``None`` or empty) URL never matches.
        """
        if not url:
            return False
        if self._FILE_URL.match(url):
            return True
        return bool(self._ATTACHMENT_URL.match(url))

    def _crawl_forum_or_thread(self, url):
        """Queue ``url`` with the callback matching its kind; ignore others."""
        if not url:
            return
        if self._FORUM_URL.match(url):
            self.crawl(url, callback=self.forum_page)
        elif self._THREAD_URL.match(url):
            self.crawl(url, callback=self.detail_page)

查看全文

相关阅读:
对 Spring IoC 的理解
 初识 Spring 框架
 CSS 全局样式
 Bootstrap 12 栅格系统
 551 闭包，浏览器垃圾回收机制/内存回收机制
 550 JavaScript运行机制之“堆栈”
549 数据类型转换汇总：转换为Number、字符串、布尔，比较操作==、===，练习题
 547 Promise：Ajax 的串行、并行， Promise的executor和状态，then、catch、finally，then链
 546 变量提升
 545 parseInt解析

原文地址：https://www.cnblogs.com/microman/p/6145466.html