  • Multi-threaded web crawler in Python (Part 2)

    #!/usr/bin/env python
    #coding=utf-8
    import threading
    import urllib
    import re
    import time
    
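    # Shared state for breadth-first depth accounting (guarded by t_mutex in the
    # threaded version): cur counts pages processed, last marks the page count at
    # which the current depth level ends, and totalcount counts URLs discovered
    # so far. When cur reaches last, one BFS level is complete and depth goes up.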
    cur=0
    last=0
    totalcount=0
    depth=0
    t_mutex=threading.Lock()  # plain mutex; the code only ever acquires/releases it
    
    class Mycrawler:
        def __init__(self,crawlername,seeds,threadnum):
            self.crawlername=crawlername
            self.seeds=seeds
            self.crawqueue=CrawQueue()
            self.initQueue(self.seeds)
            self.threadnum=threadnum
            self.threadpools=[]
            self.logfile=open('log2.txt','w')
        def initQueue(self,seeds):
            if isinstance(seeds,str):
                self.crawqueue.push(seeds)
            elif isinstance(seeds,list):
                for seed in seeds:
                    self.crawqueue.push(seed)
            global last
            global totalcount
            totalcount=self.crawqueue.getQueueCount()
            last=totalcount
        def getLinks(self,url):
            # download the page and extract every quoted absolute http:// link
            try:
                page=urllib.urlopen(url)
                html=page.read()
                reg=r'"(http://.+?)"'
                regob=re.compile(reg,re.DOTALL)
                links=regob.findall(html)
                return links
            except:
                print 'Failed downloading and saving',url
                return None
        def crawling(self):
            global cur
            global depth
            global last
            global totalcount
            self.log(">>>Depth "+str(depth)+":
    ")
            while self.crawqueue.getQueueCount()!=0:
                url=self.crawqueue.pop()
                if url is None:
                    continue
                self.log(url)
                self.crawqueue.addToVisited(url)
                links=self.getLinks(url)
                if links is None:
                    print 'None'
                    self.crawqueue.failed.append(url)
                    continue
                beforenum = self.crawqueue.getQueueCount()
                self.crawqueue.addLinks(links)
                afternum  = self.crawqueue.getQueueCount()
                totalcount+=afternum-beforenum
                cur+=1
                if cur==last:
                    depth+=1
                    self.log(">>>Depth "+str(depth)+":
    ")
                    last=totalcount
        def crawling2(self):
            global last
            global totalcount
            global depth
            self.log(">>>Depth "+str(depth)+":
    ")
            totalcount=self.crawqueue.getQueueCount()
            last=totalcount
            while self.crawqueue.getQueueCount()!=0:
                self.threadpools=[]                    # fresh batch of workers each round
                for i in range(self.threadnum):
                    url=self.crawqueue.pop()
                    if url is None:
                        break
                    crawthread=crawlerThread(url,i,self)
                    self.threadpools.append(crawthread)
                    crawthread.start()
                for crawthread in self.threadpools:
                    crawthread.join(30)                # wait up to 30s per worker
        def log(self,content):
            self.logfile.write(content+"\n")
    class crawlerThread(threading.Thread):
        def __init__(self,url,tid,mycrawler):
            threading.Thread.__init__(self)
            self.url=url
            self.tid=tid
            self.mycrawler=mycrawler
        def run(self):
            global t_mutex
            global cur
            global last
            global totalcount
            global depth
            t_mutex.acquire()
            self.mycrawler.log(self.url)
            t_mutex.release()
            links=self.mycrawler.getLinks(self.url)
            if links is None:
                t_mutex.acquire()
                self.mycrawler.crawqueue.addToVisited(self.url)
                self.mycrawler.crawqueue.addToFailed(self.url)
                t_mutex.release()
            else:
                t_mutex.acquire()
                self.mycrawler.crawqueue.addToVisited(self.url)
                beforenum=self.mycrawler.crawqueue.getQueueCount()
                self.mycrawler.crawqueue.addLinks(links)
                afternum =self.mycrawler.crawqueue.getQueueCount()
                totalcount+=afternum-beforenum
                t_mutex.release()
            t_mutex.acquire()
            cur+=1
            if cur==last:
                depth+=1
                self.mycrawler.log(">>>Depth "+str(depth)+":\n")
                last=totalcount
            t_mutex.release()
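        # Note: every access to the shared CrawQueue and the global counters in
        # run() is wrapped in t_mutex.acquire()/release(); the plain-list
        # CrawQueue is not thread-safe on its own.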
    class CrawQueue:
        def __init__(self):
            self.queue=[]
            self.visited=[]
            self.failed=[]
        def getQueue(self):
            return self.queue
        def getVisited(self):
            return self.visited
        def getFailed(self):
            return self.failed
        def push(self,url):
            if url!="" and url not in self.queue and url not in self.visited: 
                self.queue.insert(0,url)
        def pop(self):
            if len(self.queue)==0:
                #print 'failed to pop: queue is empty'
                return None
            else:
                return self.queue.pop()
        def isEmpty(self):
            return len(self.queue)==0
        def addToVisited(self,url):
            self.visited.append(url)
        def addToFailed(self,url):
            self.failed.append(url)
        def remove(self,url):
            self.queue.remove(url)
        def getVisitedCount(self):
            return len(self.visited)
        def getQueueCount(self):
            return len(self.queue)
        def addLinks(self,links):
            for link in links:
                self.push(link)
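
    # FIFO sanity check (illustrative): push() inserts at index 0 and pop()
    # removes from the tail, so URLs come out in the order they were added:
    #   q=CrawQueue(); q.push("http://a/"); q.push("http://b/")
    #   q.pop()  ->  "http://a/"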
    
    if __name__=="__main__":
        seeds="http://www.douban.com/"
        threadnum=int(raw_input("Number of threads: "))
        crawlername="little crawler"
        mycrawler=Mycrawler(crawlername,seeds,threadnum)
        mycrawler.crawling2()
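
The listing above is Python 2 (print statements, urllib.urlopen, raw_input). As a rough modern reference, here is a minimal Python 3 sketch of the same design, a fixed pool of worker threads sharing a thread-safe frontier, using only the standard library. The names (SEEDS, MAX_PAGES, worker, get_links) and the page cap are illustrative assumptions, not part of the original post.

    #!/usr/bin/env python3
    # Illustrative Python 3 sketch; not the original author's code.
    import re
    import threading
    import queue
    import urllib.request

    SEEDS = ["http://www.douban.com/"]   # same seed as the original
    THREADNUM = 4                        # assumption: fixed pool size
    MAX_PAGES = 50                       # demo cap so the sketch terminates
    LINK_RE = re.compile(r'"(http://.+?)"', re.DOTALL)

    frontier = queue.Queue()             # thread-safe FIFO of URLs to visit
    seen = set()                         # URLs already queued or visited
    seen_lock = threading.Lock()         # guards only the seen set

    def get_links(url):
        # Download the page and extract every quoted absolute http:// link.
        try:
            html = urllib.request.urlopen(url, timeout=10).read()
            return LINK_RE.findall(html.decode("utf-8", "replace"))
        except Exception:
            print("Failed downloading", url)
            return []

    def worker():
        while True:
            url = frontier.get()
            try:
                for link in get_links(url):
                    with seen_lock:
                        if link in seen or len(seen) >= MAX_PAGES:
                            continue
                        seen.add(link)
                    frontier.put(link)
            finally:
                frontier.task_done()     # pairs with frontier.get()

    if __name__ == "__main__":
        for seed in SEEDS:
            seen.add(seed)
            frontier.put(seed)
        for _ in range(THREADNUM):
            threading.Thread(target=worker, daemon=True).start()
        frontier.join()                  # returns once every queued URL is processed
        print("crawled", len(seen), "URLs")

Here queue.Queue replaces the hand-rolled CrawQueue plus the global mutex: its get/put/task_done calls are internally synchronized, so only the seen set needs an explicit lock.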
    
                
  • Original post: https://www.cnblogs.com/yanglf/p/4025297.html