zoukankan      html  css  js  c++  java
  • bug-bug-bug

    #-*-coding:utf-8-*-
    import urllib
    import urllib2
    import re
    import json
    import threading
    import requests
    from lxml import etree
    from time import sleep,ctime
    from Queue import Queue
    import lxml
    from bs4 import BeautifulSoup
    from HTMLParser import HTMLParser
    from itertools import product
    
    class Get_Html_Pthread(threading.Thread):
        """Worker thread that downloads "hot" listing pages from qiushibaike.com.

        Page numbers are taken from ``que`` until it is empty.  The text of every
        page that is downloaded successfully is put on the module-level
        ``data_que``.
        """

        # Headers sent with every request.
        # - The key must be spelled 'User-Agent'; 'User_agent' is a different,
        #   meaningless header, so the default client identification was sent.
        # - 'br' is not advertised: the response body is only decoded when a
        #   Brotli decoder is installed, otherwise ``.text`` would be garbage.
        # - No 'If-None-Match': a hard-coded ETag makes the request conditional
        #   and may yield a "304 Not Modified" answer without any body.
        HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }
        # Number of download attempts per page before giving up.
        MAX_TRIES = 4
        # Seconds to wait for the server; without it a request can hang forever.
        TIMEOUT = 10

        def __init__(self,threadid,que):
            """Store the worker id and the queue of page numbers to fetch."""
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.que = que

        def run(self):
            """Thread entry point: fetch pages until the queue is drained."""
            self.gethtml()

        def gethtml(self):
            """Download every page number available on ``self.que``.

            Each page is attempted up to ``MAX_TRIES`` times.  On success the
            page text is put on ``data_que``; if all attempts fail a
            ``timeout:<url>`` line is printed and the page is skipped.
            """
            # Imported locally so the class does not depend on a file-level
            # import; the module is named ``Queue`` on Python 2.
            try:
                from Queue import Empty
            except ImportError:
                from queue import Empty

            while True:
                # A non-blocking get avoids the empty()/get() race: another
                # worker could take the last item between the two calls and
                # leave this thread blocked forever.
                try:
                    page = self.que.get(False)
                except Empty:
                    break
                print('qiushibaike spider No'+ str(self.threadid) + 'page = '+ str(page))
                url = 'https://www.qiushibaike.com/hot/page/'+str(page)+ '/'
                print(url)
                for _ in range(self.MAX_TRIES):
                    try:
                        content = requests.get(url, headers=self.HEADERS,
                                               timeout=self.TIMEOUT)
                        # Treat HTTP 4xx/5xx as failures so they are retried
                        # instead of being queued as if they were listings.
                        content.raise_for_status()
                    except requests.RequestException as e:
                        print('qiushi_spider %s' % e)
                        continue
                    data_que.put(content.text)
                    break
                else:
                    # Only reached when no attempt succeeded.
                    print('timeout:' + url)
    
    
    class Get_Message_Pthread(threading.Thread):
        """Worker thread that extracts author/phrase pairs from downloaded pages.

        HTML documents are taken from ``que``; each extracted entry is written as
        one JSON line to the shared file ``f``.  ``lock`` serialises the file
        writes and the updates of the module-level ``total`` counter.  The thread
        runs until the module-level ``exitFlag_Parser`` becomes true.
        """

        def __init__(self,threadid,que,lock,f):
            """Store the worker id, the HTML queue, the shared lock and the file."""
            threading.Thread.__init__(self)
            self.threadid = threadid
            self.lock = lock
            self.que = que
            self.f = f

        def run(self):
            """Parse queued pages until ``exitFlag_Parser`` is set."""
            # Imported locally so the class does not depend on a file-level
            # import; the module is named ``Queue`` on Python 2.
            try:
                from Queue import Empty
            except ImportError:
                from queue import Empty

            while not exitFlag_Parser:
                # A short blocking wait replaces the former busy spin on
                # get(False) while still noticing the exit flag promptly.
                try:
                    html = self.que.get(True, 0.1)
                except Empty:
                    continue
                try:
                    self.getmessage(html)
                finally:
                    # Always acknowledge the item so Queue.join() cannot hang.
                    self.que.task_done()

        def getmessage(self,html1):
            """Write one JSON line per entry found in the page ``html1``.

            An entry is every ``div`` whose id contains ``qiushi_tag``; the text
            of its first ``h2`` is stored as ``author`` and the text of its first
            ``span`` as ``phrase``.  Failures are printed and skipped.  ``total``
            is incremented once per call, whether or not parsing succeeded.
            """
            global total
            try:
                html = etree.HTML(html1)
                result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            except Exception as e:
                print('paeser_data %s' % e)
                result = []

            for each in result:
                # Handled per entry so that one malformed entry does not
                # discard the remaining entries of the page.
                try:
                    comment_res = each.xpath('.//span')[0].text
                    name = each.xpath('.//h2')[0].text
                    resultq = {
                        'author':name,
                        'phrase':comment_res,
                    }
                    print(resultq)
                    line = json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n"
                    with self.lock:
                        self.f.write(line)
                except Exception as e:
                    print('paeser_data %s' % e)

            with self.lock:
                total += 1
    
    # Shared state used by the worker threads and by main().
    # Counter incremented under ``lock`` by Get_Message_Pthread.getmessage.
    total = 0
    # Get_Message_Pthread.run loops for as long as this flag is False.
    exitFlag_Parser = False
    # Passed to the parser threads by main(); guards file writes and ``total``.
    lock = threading.Lock()
    # Downloaded page texts travel from the fetch threads to the parser threads.
    data_que = Queue()
    def main():
        """Scrape pages 1-10 and append the extracted entries to Phrase.json.

        Starts five fetch threads and five parser threads, waits until every
        page has been fetched and every queued document has been parsed, then
        tells the parser threads to stop and waits for them to exit.
        """
        global exitFlag_Parser

        pageque = Queue(60)
        for page in range(1,11):
            pageque.put(page)

        with open('Phrase.json', 'a') as output:
            gethtmlpthread = []
            for threadid in range(5):
                thread = Get_Html_Pthread(threadid,pageque)
                thread.start()
                gethtmlpthread.append(thread)

            getmessagepthread = []
            for threadid in range(5):
                thread = Get_Message_Pthread(threadid,data_que,lock,output)
                thread.start()
                getmessagepthread.append(thread)

            try:
                # Once the fetchers have exited nothing more is put on data_que.
                for t in gethtmlpthread:
                    t.join()
                # join() returns only after task_done() was called for every
                # item; an empty queue alone does not mean parsing has finished.
                data_que.join()
            finally:
                # The parser threads loop until this flag is set; without it
                # they never exit and the program hangs.
                exitFlag_Parser = True

            # Wait for the parsers (not the fetchers again) so that the file is
            # closed only after the last write.
            for t in getmessagepthread:
                t.join()
    
    # Script entry point: run the scraper, then report how many pages were parsed.
    # No ``global`` statement is needed here: this is already module scope, and
    # declaring a name global after it has been assigned is rejected by newer
    # interpreters.
    if __name__ == '__main__':
        main()
        print('total'+ str(total))
  • 相关阅读:
    Python——字符串、文件操作,英文词频统计预处理
    了解大数据的特点,来源与数据的呈现方式
    hadoop综合大作业
    分布式并行计算MapReduce
    ·分布式文件系统HDFS 练习
    安装关系型数据库MySQL 安装大数据处理框架Hadoop
    爬虫综合大作业
    爬取全部的校园新闻
    获取一篇新闻的全部信息
    理解爬虫原理
  • 原文地址:https://www.cnblogs.com/chenyang920/p/7663024.html
Copyright © 2011-2022 走看看