zoukankan      html  css  js  c++  java
  • bug-bug-bug

    #-*-coding:utf-8-*-
    import json
    import re
    import threading
    import urllib
    import urllib2
    from HTMLParser import HTMLParser
    from itertools import product
    from Queue import Empty
    from Queue import Queue
    from time import sleep,ctime

    import lxml
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    class Get_Html_Pthread(threading.Thread):
        """Worker thread that downloads qiushibaike "hot" pages.

        Page numbers are taken from ``que`` until it is empty.  The text of each
        successfully downloaded page is put on the module-level ``data_que`` so
        that the parser threads can pick it up.

        Args:
            threadid: Identifier used only in the progress messages.
            que: Queue of page numbers shared by all download threads.
        """

        # Maximum number of download attempts per page.
        MAX_TRIES = 4

        # Seconds to wait on the server before an attempt is given up; without
        # a timeout requests.get() may block indefinitely.
        REQUEST_TIMEOUT = 10

        # Browser-like request headers.
        # * The header name is 'User-Agent' (the misspelt 'User_agent' is just
        #   an unknown header to the server).
        # * 'br' is not advertised: requests only decodes brotli when an extra
        #   package is installed, otherwise the body would be unreadable.
        # * No hard-coded 'If-None-Match': a conditional request lets the server
        #   answer 304 Not Modified with an empty body.
        HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }

        def __init__(self, threadid, que):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.que = que

        def run(self):
            """Thread entry point: download pages until the page queue is empty."""
            self.gethtml()

        def gethtml(self):
            """Fetch every page number still queued and hand the HTML to ``data_que``.

            Each page is attempted up to ``MAX_TRIES`` times.  A page that fails on
            every attempt is reported and skipped.
            """
            while True:
                # Non-blocking get: checking empty() and then calling a blocking
                # get() is racy, another worker may take the last item in
                # between and leave this thread blocked forever.
                try:
                    page = self.que.get(False)
                except Empty:
                    break

                print('qiushibaike spider No' + str(self.threadid) + 'page = ' + str(page))
                url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
                print(url)

                for _ in range(self.MAX_TRIES):
                    try:
                        content = requests.get(
                            url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
                    except Exception as e:
                        # Best effort: any failure just counts as a failed attempt.
                        print('qiushi_spider ' + str(e))
                    else:
                        data_que.put(content.text)
                        break
                else:
                    # Reached only when no attempt succeeded (the loop never hit
                    # ``break``).
                    print('timeout:' + url)
    
    
    class Get_Message_Pthread(threading.Thread):
        """Worker thread that parses downloaded pages and appends entries to a file.

        Pages (HTML text) are taken from ``que`` until the module-level
        ``exitFlag_Parser`` becomes true.  For every entry found, one JSON object
        per line is written to ``f``.

        Args:
            threadid: Identifier of this worker.
            que: Queue of HTML documents produced by the download threads.
            lock: Lock guarding both writes to ``f`` and the global ``total``.
            f: Open, writable file object shared by all parser threads.
        """

        # Seconds to block waiting for a page before re-checking the exit flag.
        # Blocking with a timeout avoids spinning at 100% CPU on an empty queue.
        POLL_INTERVAL = 0.5

        def __init__(self, threadid, que, lock, f):
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.lock = lock
            self.que = que
            self.f = f

        def run(self):
            """Thread entry point: parse queued pages until asked to exit."""
            while not exitFlag_Parser:
                try:
                    html = self.que.get(True, self.POLL_INTERVAL)
                except Empty:
                    continue
                try:
                    # Nothing to parse in an empty document.
                    if html:
                        self.getmessage(html)
                finally:
                    # Always balance get() so that que.join() can return.
                    self.que.task_done()

        def getmessage(self, html1):
            """Extract author/phrase pairs from one page and append them to the file.

            Args:
                html1: HTML text of one page.

            Parsing errors are reported and the page is abandoned.  The global
            ``total`` is incremented once per call either way.
            """
            global total
            try:
                html = etree.HTML(html1)
                for each in html.xpath('//div[contains(@id,"qiushi_tag")]'):
                    spans = each.xpath('.//span')
                    titles = each.xpath('.//h2')
                    if not spans or not titles:
                        # Malformed entry: skip it rather than losing the rest
                        # of the page to an IndexError.
                        continue
                    resultq = {
                        'author': titles[0].text,
                        'phrase': spans[0].text,
                    }
                    print(resultq)
                    # Serialise outside the lock; only the write is shared state.
                    line = json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n"
                    with self.lock:
                        self.f.write(line)
            except Exception as e:
                print('paeser_data ' + str(e))

            with self.lock:
                total += 1
    
    # Shared state used by the worker threads.

    # Number of pages handed to the parser so far (updated under ``lock``).
    total = 0
    # Parser threads keep running while this flag is False.
    exitFlag_Parser = False
    # Guards the output file and the ``total`` counter.
    lock = threading.Lock()
    # Downloaded HTML waiting to be parsed.
    data_que = Queue()
    def main():
        """Download pages 1-10 and append the parsed entries to ``Phrase.json``.

        Five download threads fill ``data_que`` and five parser threads drain it.
        The function returns once every page has been downloaded and parsed, all
        threads have finished and the output file is closed.
        """
        global exitFlag_Parser
        # Reset the flag so main() also works when called more than once.
        exitFlag_Parser = False

        pageque = Queue(60)
        for page in range(1, 11):
            pageque.put(page)

        gethtmlpthread = []
        getmessagepthread = []

        with open('Phrase.json', 'a') as output:
            try:
                for threadid in range(5):
                    thread = Get_Html_Pthread(threadid, pageque)
                    thread.start()
                    gethtmlpthread.append(thread)

                for threadid in range(5):
                    thread = Get_Message_Pthread(threadid, data_que, lock, output)
                    thread.start()
                    getmessagepthread.append(thread)

                # Joining the downloaders is enough; no need to busy-wait on
                # the page queue first.
                for t in gethtmlpthread:
                    t.join()

                # Wait until every queued page has been fully processed
                # (task_done() called), not merely removed from the queue.
                data_que.join()
            finally:
                # Tell the parser threads to stop and wait for them, so nothing
                # writes to the file after it is closed and the process can exit.
                exitFlag_Parser = True
                for t in getmessagepthread:
                    t.join()
    
    # Script entry point: run the crawl, then report how many pages were parsed.
    # No ``global`` statement is needed here; this code already runs at module
    # scope, where ``total`` is an ordinary module-level name.
    if __name__ == '__main__':
        main()
        print('total' + str(total))
  • 相关阅读:
    Java核心技术Java程序设计
    Mac下查看 Java 安装目录位置和安装数量
    Intellij IDEA快捷键与使用小技巧
    Java 8 新特性 用 Collectors 对 List 去重
    onInterceptTouchEvent()与onTouchEvent()的机制
    Android 开发之多线程处理、Handler
    安卓中使用XmlPullParser解析xml文件
    监控部署nagios+snmp
    阿里RDS数据库 全量备份恢复到本地MYSQL
    20120412
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7663024.html
Copyright © 2011-2022 走看看