zoukankan      html  css  js  c++  java
  • bug-bug-bug

    #-*-coding:utf-8-*-
    import urllib
    import urllib2
    import re
    import json
    import threading
    import requests
    from lxml import etree
    from time import sleep,ctime
    from Queue import Queue
    import lxml
    from bs4 import BeautifulSoup
    from HTMLParser import HTMLParser
    from itertools import product
    
    class Get_Html_Pthread(threading.Thread):
        def __init__(self,threadid,que):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.que = que
        def run(self):
            self.gethtml()
    
        def gethtml(self):
            while True:
                if self.que.empty():
                    break
                else:
                    page = self.que.get()
                    print 'qiushibaike spider No'+ str(self.threadid) + 'page = '+ str(page)
                    url = 'https://www.qiushibaike.com/hot/page/'+str(page)+ '/'
                    print url
                    headers = {
                        'User_agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
                        'Accept-Language': 'zh-CN,zh;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'If-None-Match': '56abab9faecd14ce1ba95132d661a82db1466c94'}
                    num_try = 4
                    while num_try > 0:
                        num_try -= 1
                        try:
                            content = requests.get(url, headers=headers)
                            data_que.put(content.text)
                            break
                        except Exception, e:
                            print 'qiushi_spider', e
                    if num_try > 0:
                        print 'timeout:' + url
    
    
    class Get_Message_Pthread(threading.Thread):
        def __init__(self,threadid,que,lock,f):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.lock = lock
            self.que = que
            self.f = f
        def run(self):
            global total,exitFlag_Parser
            while exitFlag_Parser == False:
                try:
                    html = self.que.get(False)
                    if not html:
                        pass
                    self.getmessage(html)
                    self.que.task_done()
                except:
                    pass
    
        def getmessage(self,html1):
            global total
            try:
                html = etree.HTML(html1)
                result = html.xpath('//div[contains(@id,"qiushi_tag")]')
                for each in result:
                    comment_res = each.xpath('.//span')[0].text
                    name = each.xpath('.//h2')[0].text
                    resultq = {
                        'author':name,
                        'phrase':comment_res,
                    }
                    print resultq
                    with self.lock:
                        self.f.write(json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "
    ")
    
            except Exception,e:
                print 'paeser_data',e
    
            with self.lock:
                total += 1
    
    data_que = Queue()
    lock = threading.Lock()
    exitFlag_Parser = False
    total = 0
    def main():
        output = open('Phrase.json', 'a')
        pageque = Queue(60)
        for page in range(1,11):
            pageque.put(page)
        gethtmlpthread = []
        List = [0,1,2,3,4,5]
        for threadid in range(5):
            thread = Get_Html_Pthread(threadid,pageque)
            thread.start()
            gethtmlpthread.append(thread)
    
        getmessagepthread = []
    
    
        for threadid in range(5):
            thread = Get_Message_Pthread(threadid,data_que,lock,output)
            thread.start()
            getmessagepthread.append(thread)
    
        while not pageque.empty():
            pass
    
        for t in  gethtmlpthread:
            t.join()
    
        while not data_que.empty():
            pass
    
        for t in gethtmlpthread:
            t.join()
        with lock:
            output.close()
    
    if __name__ == '__main__':
        global total
        main()
        print 'total'+ str(total)
  • 相关阅读:
    Max History CodeForces
    Buy a Ticket CodeForces
    AC日记——字符串的展开 openjudge 1.7 35
    AC日记——回文子串 openjudge 1.7 34
    AC日记——判断字符串是否为回文 openjudge 1.7 33
    AC日记——行程长度编码 openjudge 1.7 32
    AC日记——字符串P型编码 openjudge 1.7 31
    AC日记——字符环 openjudge 1.7 30
    AC日记——ISBN号码 openjudge 1.7 29
    AC日记——单词倒排 1.7 28
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7663024.html
Copyright © 2011-2022 走看看