zoukankan html css js c++ java

bug-bug-bug

#-*-coding:utf-8-*-
import urllib
import urllib2
import re
import json
import threading
import requests
from lxml import etree
from time import sleep,ctime
from Queue import Queue
import lxml
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
from itertools import product

class Get_Html_Pthread(threading.Thread):
    """Worker thread that downloads qiushibaike "hot" listing pages.

    Page numbers are taken from ``que``; the response text of every page
    that is fetched successfully is put on the module-level ``data_que``.
    """

    def __init__(self, threadid, que):
        """Remember the worker id and the queue of page numbers.

        threadid -- identifier, used only in the log output.
        que      -- queue of page numbers shared by the download workers.
        """
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.que = que

    def run(self):
        """Thread entry point: fetch pages until the page queue is drained."""
        self.gethtml()

    def gethtml(self):
        """Fetch every page number available in ``self.que``.

        Each page is tried up to four times.  The text of the first
        successful response is put on ``data_que``; if every attempt
        raises, a ``timeout:<url>`` line is printed instead.
        """
        # Only the Queue class is imported at file level, so pull in the
        # matching "queue is empty" exception here.
        from Queue import Empty

        # Request headers are identical for every page, so build them once.
        # NOTE(review): a fixed If-None-Match value may let the server answer
        # 304 with an empty body - confirm this header is really wanted.
        headers = {
            # The header name is "User-Agent"; "User_agent" is not a header
            # servers recognise.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'If-None-Match': '56abab9faecd14ce1ba95132d661a82db1466c94',
        }
        max_tries = 4

        while True:
            # A non-blocking get avoids the empty()/get() race: another
            # worker may take the last page between the two calls, which
            # would leave a blocking get() waiting forever.
            try:
                page = self.que.get_nowait()
            except Empty:
                break

            print('qiushibaike spider No' + str(self.threadid) + 'page = ' + str(page))
            url = 'https://www.qiushibaike.com/hot/page/' + str(page) + '/'
            print(url)

            for _ in range(max_tries):
                try:
                    # Without a timeout a stalled connection would block
                    # this worker indefinitely.
                    content = requests.get(url, headers=headers, timeout=10)
                    data_que.put(content.text)
                    break
                except Exception as e:
                    print('qiushi_spider %s' % e)
            else:
                # Reached only when no attempt succeeded (the loop never
                # hit ``break``).
                print('timeout:' + url)


class Get_Message_Pthread(threading.Thread):
    """Worker thread that parses downloaded pages and writes the entries out.

    Pages (HTML text) are taken from ``que``.  For each entry found, one
    JSON object with ``author`` and ``phrase`` keys is written as a line to
    the file object ``f``.  The worker keeps running until the module-level
    ``exitFlag_Parser`` becomes true.
    """

    def __init__(self, threadid, que, lock, f):
        """Store the worker's collaborators.

        threadid -- identifier of this worker.
        que      -- queue of HTML page texts to parse.
        lock     -- lock guarding writes to ``f`` and updates of ``total``.
        f        -- writable file object receiving one JSON line per entry.
        """
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.lock = lock
        self.que = que
        self.f = f

    def run(self):
        """Parse queued pages until ``exitFlag_Parser`` is set."""
        # Only the Queue class is imported at file level, so pull in the
        # matching "queue is empty" exception here.
        from Queue import Empty

        while not exitFlag_Parser:
            try:
                # Block briefly instead of spinning on a non-blocking get;
                # the timeout makes sure the exit flag is re-checked.
                html = self.que.get(True, 0.5)
            except Empty:
                continue
            try:
                self.getmessage(html)
            finally:
                # Always balance the get() so Queue.join() can return.
                self.que.task_done()

    def getmessage(self, html1):
        """Extract author/phrase pairs from one page and append them to ``f``.

        html1 -- HTML text of a listing page.

        Any exception raised while parsing or writing is printed and ends
        the processing of this page.  ``total`` is incremented once per
        call, whether or not parsing succeeded.
        """
        global total
        try:
            html = etree.HTML(html1)
            result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            for each in result:
                comment_res = each.xpath('.//span')[0].text
                name = each.xpath('.//h2')[0].text
                resultq = {
                    'author': name,
                    'phrase': comment_res,
                }
                print(resultq)
                # Serialise outside the lock; hold it only for the write.
                line = json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n"
                with self.lock:
                    self.f.write(line)
        except Exception as e:
            print('paeser_data %s' % e)

        with self.lock:
            total += 1

# Shared state used by the worker threads and main().
total = 0                   # incremented once per getmessage() call, under ``lock``
exitFlag_Parser = False     # parser threads loop while this is False
lock = threading.Lock()     # guards the output file and ``total``
data_que = Queue()          # downloaded page texts waiting to be parsed
def main():
    """Crawl pages 1-10 and append the parsed entries to ``Phrase.json``.

    Starts five download workers that feed ``data_que`` and five parser
    workers that drain it, waits for both stages to finish, then signals
    the parser workers to stop and closes the output file.
    """
    global exitFlag_Parser
    # Reset the flag so the parser workers run even if main() is called again.
    exitFlag_Parser = False

    pageque = Queue(60)
    for page in range(1, 11):
        pageque.put(page)

    gethtmlpthread = []
    getmessagepthread = []

    # The context manager closes the file even if something below raises.
    with open('Phrase.json', 'a') as output:
        try:
            for threadid in range(5):
                thread = Get_Html_Pthread(threadid, pageque)
                thread.start()
                gethtmlpthread.append(thread)

            for threadid in range(5):
                thread = Get_Message_Pthread(threadid, data_que, lock, output)
                thread.start()
                getmessagepthread.append(thread)

            # Joining the downloaders is enough; no need to busy-wait on
            # the page queue first.
            for t in gethtmlpthread:
                t.join()

            # Every page is queued by now.  join() returns once each one
            # has been marked done by a parser, which an empty() check
            # cannot guarantee.
            data_que.join()
        finally:
            # Tell the parser workers to leave their loop, then wait for
            # them so nothing writes to the file after it is closed.
            exitFlag_Parser = True
            for t in getmessagepthread:
                t.join()

if __name__ == '__main__':
    # Run the crawl, then report the value of the module-level counter.
    # (A ``global`` statement has no effect at module scope, so none is
    # needed to read ``total`` here.)
    main()
    print('total' + str(total))

查看全文

相关阅读:
链表基础及常见面试题
 浅谈一个网页打开的全过程（涉及DNS、CDN、Nginx负载均衡等）
PHP函数高级（二）
sql注入笔记
 PHPStorm2017去掉参数提示 parameter name hints
CDN与智能DNS原理和应用
 用户黏性与垂直社区，互联网营销狼人:
我在赶集网的两个月（完整版），互联网营销狼人:
微博变种与RSS变种，互联网营销狼人:
从 Reddit 学到的经验，互联网营销狼人:

原文地址：https://www.cnblogs.com/chenyang920/p/7663024.html