zoukankan      html  css  js  c++  java
  • bug-bug-bug

    #-*-coding:utf-8-*-
    import json
    import re
    import threading
    import urllib
    import urllib2
    from HTMLParser import HTMLParser
    from itertools import product
    from Queue import Empty
    from Queue import Queue
    from time import sleep,ctime

    import lxml
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    class Get_Html_Pthread(threading.Thread):
        """Worker thread that downloads listing pages.

        Page numbers are taken from ``que``.  The text of every page that is
        fetched successfully is put on the module-level ``data_que``.
        """

        # Number of attempts made for one page before giving up.
        MAX_TRIES = 4
        # Seconds to wait for the server; without a timeout a stalled
        # connection would block the worker forever.
        REQUEST_TIMEOUT = 10

        def __init__(self, threadid, que):
            """Store the worker id (used in log output) and the page queue."""
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.que = que

        def run(self):
            """Thread entry point: drain the page queue."""
            self.gethtml()

        def gethtml(self):
            """Fetch pages until ``self.que`` is empty.

            Each page is tried up to ``MAX_TRIES`` times.  A failed attempt is
            reported and retried; when every attempt fails a ``timeout:`` line
            is printed and the worker moves on to the next page.
            """
            # NOTE(review): the fixed 'If-None-Match' value may let the server
            # answer 304 with an empty body, and 'br' in 'Accept-Encoding' is
            # only decoded when brotli support is installed -- confirm both
            # are really wanted.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'If-None-Match': '56abab9faecd14ce1ba95132d661a82db1466c94'}
            while True:
                # A non-blocking get avoids the empty()/get() race: another
                # worker may take the last page between the two calls, which
                # would leave a blocking get() waiting forever.
                try:
                    page = self.que.get_nowait()
                except Empty:
                    break
                print('qiushibaike spider No' + str(self.threadid) + 'page = ' + str(page))
                url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
                print(url)
                for _ in range(self.MAX_TRIES):
                    try:
                        content = requests.get(url, headers=headers,
                                               timeout=self.REQUEST_TIMEOUT)
                        data_que.put(content.text)
                        break
                    except Exception as e:
                        print('qiushi_spider %s' % e)
                else:
                    # Reached only when no attempt succeeded (no ``break``).
                    print('timeout:' + url)
    
    
    class Get_Message_Pthread(threading.Thread):
        """Worker thread that parses downloaded pages and writes the results.

        HTML strings are taken from ``que``.  For every matching entry one JSON
        line with the keys ``author`` and ``phrase`` is written to ``f``.
        ``lock`` serialises the writes to ``f`` and the updates of the
        module-level ``total`` counter.
        """

        def __init__(self, threadid, que, lock, f):
            """Store the worker id, HTML queue, shared lock and output file."""
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.lock = lock
            self.que = que
            self.f = f

        def run(self):
            """Process queued pages until the module-level exit flag is set."""
            while not exitFlag_Parser:
                # Wait briefly instead of spinning on a non-blocking get; the
                # short timeout keeps the exit flag checked regularly.
                try:
                    html = self.que.get(timeout=0.1)
                except Empty:
                    continue
                try:
                    if html:  # nothing to parse in an empty response
                        self.getmessage(html)
                finally:
                    # Always balance the get() so Queue.join() cannot hang.
                    self.que.task_done()

        def getmessage(self, html1):
            """Extract author/phrase pairs from one page and append them to ``f``.

            Parsing errors are reported and swallowed so one bad page does not
            stop the worker.  ``total`` is incremented once per call.
            """
            global total
            try:
                html = etree.HTML(html1)
                result = html.xpath('//div[contains(@id,"qiushi_tag")]')
                for each in result:
                    spans = each.xpath('.//span')
                    titles = each.xpath('.//h2')
                    if not spans or not titles:
                        # Skip a malformed entry instead of dropping the rest
                        # of the page with an IndexError.
                        continue
                    resultq = {
                        'author': titles[0].text,
                        'phrase': spans[0].text,
                    }
                    print(resultq)
                    line = json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n"
                    with self.lock:
                        self.f.write(line)
            except Exception as e:
                print('paeser_data %s' % e)

            with self.lock:
                total += 1
    
    # Shared state used by the worker threads.
    # HTML text of fetched pages: filled by Get_Html_Pthread.gethtml and handed
    # to the parser threads created in main().
    data_que = Queue()
    # Passed to the parser threads; guards writes to the output file and the
    # updates of ``total``.
    lock = threading.Lock()
    # Polled by Get_Message_Pthread.run; its loop ends once this is True.
    exitFlag_Parser = False
    # Incremented once per Get_Message_Pthread.getmessage call.
    total = 0
    def main():
        """Fetch pages 1-10 with five fetch threads and parse them with five
        parser threads, appending the results to ``Phrase.json``.

        Shutdown order matters: the fetch threads are joined first, then the
        HTML queue is drained with ``Queue.join()`` (which waits until every
        item has been marked done, not merely removed), and only then are the
        parser threads told to stop and the file closed.
        """
        global exitFlag_Parser

        pageque = Queue(60)
        for page in range(1, 11):
            pageque.put(page)

        gethtmlpthread = []
        getmessagepthread = []
        with open('Phrase.json', 'a') as output:
            try:
                for threadid in range(5):
                    thread = Get_Html_Pthread(threadid, pageque)
                    thread.start()
                    gethtmlpthread.append(thread)

                for threadid in range(5):
                    thread = Get_Message_Pthread(threadid, data_que, lock, output)
                    thread.start()
                    getmessagepthread.append(thread)

                # Joining blocks without burning CPU, unlike polling empty().
                for t in gethtmlpthread:
                    t.join()

                # Wait until every queued page has been fully processed.
                data_que.join()
            finally:
                # Without this the parser threads loop forever and the process
                # never exits; join them before the file is closed.
                exitFlag_Parser = True
                for t in getmessagepthread:
                    t.join()
    
    if __name__ == '__main__':
        # No ``global`` statement here: module scope already is the global
        # scope, and declaring ``total`` global after its assignment above is
        # at best a warning and at worst a syntax error.
        main()
        print('total' + str(total))
  • 相关阅读:
    Best Cow Fences_二分&&DP
    Palindrome_滚动数组&&DP
    Making the Grade_滚动数组&&dp
    BUY LOW, BUY LOWER_最长下降子序列
    Testing the CATCHER_DP
    Best Sequence_DFS&&KMp
    (Your)((Term)((Project)))
    [SOJ] 畅通工程续
    [SOJ] 商人的宣传
    [SOJ] 无路可逃?
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7663024.html
Copyright © 2011-2022 走看看