zoukankan      html  css  js  c++  java
  • pyspider—爬取视频链接

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2015-03-20 09:46:20
    # Project: fly_spider
    
    import re
    import time
    #from pyspider.database.mysql.mysqldb import SQL
    from pyspider.libs.base_handler import *
    from pyquery import PyQuery as pq
    
    class Handler(BaseHandler):
        """pyspider handler that walks zhanqi.tv game listings down to room pages.

        Crawl flow: ``on_start`` -> ``index_page`` (list of games) ->
        ``list_page`` (rooms of one game) -> ``detail_page`` (one room), which
        returns a dict of scraped fields.
        """

        # Browser-like request headers sent with every fetch (via crawl_config).
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36",
        }

        crawl_config = {
            "headers": headers,
            "timeout": 100,
        }

        # URL filters. The original patterns read "w+" (backslash lost), which
        # only matched paths starting with a literal "w"; "\w+" is the intended
        # "one or more word characters". Host dots are escaped so they match
        # literally.
        _GAME_URL_RE = re.compile(r"http://www\.zhanqi\.tv/games/\w+", re.U)
        _ROOM_URL_RE = re.compile(r"http://www\.zhanqi\.tv/\w+", re.U)

        # Script passed as js_script for JS-rendered fetches; scrolls to the
        # bottom of the page.
        # NOTE(review): setTimeout receives the *result* of scrollTo(...), not a
        # function, so the scroll runs immediately and the 5000 ms delay has no
        # effect. Left unchanged because deferring the scroll could let the
        # fetcher capture the page before it happens - confirm before altering.
        _SCROLL_JS = """
                    function() {
                        setTimeout(window.scrollTo(0,document.body.scrollHeight), 5000);
                    }
                    """

        @every(minutes=1)
        def on_start(self):
            """Seed the crawl with the games overview page."""
            self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue a JS-rendered fetch of every game page linked from the overview."""
            print(response)
            for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
                href = each.attr.href
                if href and self._GAME_URL_RE.match(href):
                    self.crawl(
                        href,
                        fetch_type='js',
                        js_script=self._SCROLL_JS,
                        callback=self.list_page,
                    )

        @config(age=1 * 60 * 60, priority=2)
        def list_page(self, response):
            """Queue a JS-rendered fetch of every room linked from a game's hot list."""
            selector = '.active > div.live-list-tabc > ul#hotList.clearfix > li > a'
            for each in response.doc(selector).items():
                href = each.attr.href
                # The selector does not require an href attribute; skip anchors
                # without one instead of passing None to the regex.
                if href and self._ROOM_URL_RE.match(href):
                    self.crawl(
                        href,
                        fetch_type='js',
                        js_script=self._SCROLL_JS,
                        callback=self.detail_page,
                    )

        @config(age=1 * 60 * 60, priority=2)
        def detail_page(self, response):
            """Scrape one room page and return its fields as a dict.

            "flash-cont" holds the inner HTML of the last ``.video-flash-cont``
            element, or None when the page has no such element (the original
            raised UnboundLocalError in that case).
            """
            flash_html = None
            for each in response.doc('.video-flash-cont').items():
                flash_html = pq(each).html()
                print(flash_html)

            return {
                "url": response.url,
                "author": response.doc('.meat > span').text(),
                "title": response.doc('.title-name').text(),
                "game-name": response.doc('span > .game-name').text(),
                "users2": response.doc('div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span').text(),
                "flash-cont": flash_html,
                # <img> has no text content, so .text() was always empty; the
                # image URL lives in the src attribute.
                "picture": response.doc('.active > img').attr('src'),
            }
  • 相关阅读:
    Best Time to Buy and Sell Stock III
    Valid Palindrome
    Longest Substring Without Repeating Characters
    Copy List with Random Pointer
    Add Two Numbers
    Recover Binary Search Tree
    Anagrams
    ZigZag Conversion
    Merge k Sorted Lists
    Distinct Subsequences
  • 原文地址:https://www.cnblogs.com/panliu/p/4849217.html
Copyright © 2011-2022 走看看