zoukankan      html  css  js  c++  java
  • bug-bug-bug

    #-*-coding:utf-8-*-
    import json
    import re
    import threading
    import urllib
    import urllib2
    from HTMLParser import HTMLParser
    from itertools import product
    from Queue import Empty
    from Queue import Queue
    from time import sleep,ctime

    import lxml
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    class Get_Html_Pthread(threading.Thread):
        """Worker thread that downloads qiushibaike "hot" listing pages.

        Page numbers are taken from ``que``; the text of every successfully
        fetched page is put on the module-level ``data_que``.

        Args:
            threadid: Identifier used only in progress messages.
            que: Queue of page numbers to download.
        """

        # How many times a single page is attempted before giving up.
        MAX_TRIES = 4
        # Seconds to wait for the server; without it a stalled connection
        # would block the worker (and anything joining it) indefinitely.
        REQUEST_TIMEOUT = 10

        def __init__(self, threadid, que):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.que = que

        def run(self):
            self.gethtml()

        def gethtml(self):
            """Drain ``self.que``, fetching each page with a bounded retry.

            Returns when the queue is empty. A page that still fails after
            ``MAX_TRIES`` attempts is reported and skipped.
            """
            headers = {
                # The header name is "User-Agent"; the former "User_agent" key
                # was sent as an unrelated header.
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                # "br" is not advertised: requests only decodes it when an
                # optional brotli package is installed.
                'Accept-Encoding': 'gzip, deflate',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                # No hard-coded If-None-Match: a matching ETag would yield a
                # 304 response with an empty body.
            }
            while True:
                # A non-blocking get avoids the race between empty() and a
                # blocking get() when several workers share the queue.
                try:
                    page = self.que.get_nowait()
                except Empty:
                    break
                print('qiushibaike spider No' + str(self.threadid) + 'page = ' + str(page))
                url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
                print(url)
                for _attempt in range(self.MAX_TRIES):
                    try:
                        content = requests.get(url, headers=headers,
                                               timeout=self.REQUEST_TIMEOUT)
                    except requests.RequestException as e:
                        print('qiushi_spider %s' % (e,))
                    else:
                        data_que.put(content.text)
                        break
                else:
                    # Reached only when every attempt failed.
                    print('timeout:' + url)
    
    
    class Get_Message_Pthread(threading.Thread):
        """Worker thread that parses fetched pages and appends records to a file.

        Args:
            threadid: Identifier of this worker.
            que: Queue of raw HTML strings to parse.
            lock: Lock held while writing to ``f`` and while updating the
                module-level ``total`` counter.
            f: Open, writable file object that receives one JSON line per entry.
        """

        # Seconds to block on the queue before re-checking the exit flag.
        POLL_INTERVAL = 0.1

        def __init__(self, threadid, que, lock, f):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.lock = lock
            self.que = que
            self.f = f

        def run(self):
            """Parse queued pages until the module-level ``exitFlag_Parser`` is set."""
            while not exitFlag_Parser:
                # Block briefly instead of spinning on a non-blocking get().
                try:
                    html = self.que.get(True, self.POLL_INTERVAL)
                except Empty:
                    continue
                try:
                    self.getmessage(html)
                finally:
                    # Always acknowledge the item so Queue.join() can return.
                    self.que.task_done()

        @staticmethod
        def _format_line(record):
            """Return ``record`` as one UTF-8 encoded JSON line (with newline)."""
            return json.dumps(record, ensure_ascii=False).encode('utf-8') + b"\n"

        def getmessage(self, html1):
            """Extract author/phrase pairs from one page and write them out.

            Each ``div`` whose id contains "qiushi_tag" contributes the text of
            its first ``h2`` (author) and first ``span`` (phrase). Entries
            lacking either element are skipped. ``total`` is incremented once
            per call, whether or not parsing succeeded.
            """
            global total
            try:
                html = etree.HTML(html1)
                result = html.xpath('//div[contains(@id,"qiushi_tag")]')
                for each in result:
                    spans = each.xpath('.//span')
                    titles = each.xpath('.//h2')
                    if not spans or not titles:
                        # Skip a malformed entry rather than abandoning the
                        # remaining entries on the page.
                        continue
                    resultq = {
                        'author': titles[0].text,
                        'phrase': spans[0].text,
                    }
                    print(resultq)
                    line = self._format_line(resultq)
                    with self.lock:
                        self.f.write(line)
            except Exception as e:
                print('paeser_data %s' % (e,))

            with self.lock:
                total += 1
    
    # Raw HTML text of fetched pages: filled by Get_Html_Pthread.gethtml and
    # handed to the Get_Message_Pthread workers by main().
    data_que = Queue()
    # Shared lock passed to every Get_Message_Pthread; held while writing to the
    # output file and while updating `total`.
    lock = threading.Lock()
    # Get_Message_Pthread.run keeps polling its queue for as long as this is False.
    exitFlag_Parser = False
    # Incremented once per Get_Message_Pthread.getmessage() call.
    total = 0
    def main():
        """Fetch pages 1-10 with five downloader threads and parse them with five parsers.

        Parsed records are appended to ``Phrase.json``. The function returns
        only after every fetched page has been parsed and all worker threads
        have exited.
        """
        global exitFlag_Parser
        # Reset so that a repeated call starts with running parsers.
        exitFlag_Parser = False

        pageque = Queue(60)
        for page in range(1, 11):
            pageque.put(page)

        gethtmlpthread = []
        getmessagepthread = []
        output = open('Phrase.json', 'a')
        try:
            for threadid in range(5):
                thread = Get_Html_Pthread(threadid, pageque)
                thread.start()
                gethtmlpthread.append(thread)

            for threadid in range(5):
                thread = Get_Message_Pthread(threadid, data_que, lock, output)
                thread.start()
                getmessagepthread.append(thread)

            # Downloaders exit once the page queue is drained; joining them
            # replaces the former busy-wait on pageque.empty().
            for t in gethtmlpthread:
                t.join()

            # Every fetched page is now queued. join() waits until each one
            # has been acknowledged with task_done(), which an empty() check
            # cannot guarantee.
            data_que.join()
        finally:
            # Tell the parsers to stop, then wait for them (previously the
            # downloaders were joined a second time and the flag was never
            # set, so the parsers never terminated).
            exitFlag_Parser = True
            for t in getmessagepthread:
                t.join()
            output.close()
    
    # Script entry point: run the scraper, then report how many pages were parsed.
    if __name__ == '__main__':
        main()
        print('total' + str(total))
  • 相关阅读:
    LeetCode 35 搜索插入位置
    LeetCode 69 x 的平方根
    LeetCode 61 旋转链表
    LeetCode 876 链表的中间结点
    LeetCode 142 环形链表 II
    LeetCode 206 反转链表
    LeetCode 237 删除链表中的节点
    LeetCode 83 删除排序链表中的重复元素
    元素的隐藏与显示与判断 js jquery aspx.cs
    判断对象是否为空 js与Jquery区别
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7663024.html
Copyright © 2011-2022 走看看