  • Python crawler

    The crawler's job is to download a page and then also follow and download
    the links it finds, level by level, down to the n-th level of pages.
    Main program flow (sketched in the loop just below):
        start the crawler
        create a queue holding the starting URL
        loop: exit when the queue is empty
            pop a URL off the queue
            download the page and find the next level of links
            add those links to the queue
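    As a quick illustration, that loop in modern Python might look like the
    minimal sketch below (assumes Python 3; fetch_page and extract_links are
    hypothetical placeholders for the download and parse steps that the full
    Python 2 program implements next):

    from collections import deque

    def crawl(start_url):
        q = deque([start_url])                  # queue of URLs to visit
        seen = set()                            # URLs already processed
        while q:                                # exit when the queue is empty
            url = q.popleft()                   # pop a URL off the queue
            if url in seen:
                continue
            seen.add(url)
            page = fetch_page(url)              # download the page (placeholder)
            for link in extract_links(page):    # find next-level links (placeholder)
                if link not in seen:
                    q.append(link)              # add them to the queue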

    # NOTE: this script is Python 2 only -- htmllib, formatter, cStringIO,
    # urlparse and the string.replace/find/lower functions used below do
    # not exist in Python 3.
    from sys import argv
    from os import makedirs, unlink, sep
    from os.path import dirname, exists, isdir, splitext
    from string import replace, find, lower
    from htmllib import HTMLParser
    from urllib import urlretrieve
    from urlparse import urlparse, urljoin
    from formatter import DumbWriter, AbstractFormatter
    from cStringIO import StringIO
    import os, sys

    syspath = sys.argv[0]                       # path of this script (unused below)
    
    
    class retri(object):
        def __init__(self, url):
            self.url = url
            self.file = self.filename(url)

        def filename(self, url, deffile='index.htm'):
            # map the URL to a local file path, creating directories as needed
            parsedurl = urlparse(url, 'http:', 0)
            if parsedurl[2] == '':                  # no path component
                path = parsedurl[1] + '/' + deffile
            else:
                path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':                        # no extension: use default file name
                if path[-1] == '/':
                    path += deffile
                else:
                    path += '/' + deffile
            ldir = dirname(path)                    # local directory for the file
            if sep != '/':                          # use the OS path separator
                ldir = replace(ldir, '/', sep)
            if not isdir(ldir):                     # create the directory if needed
                if exists(ldir):
                    unlink(ldir)
                makedirs(ldir)
            return path
        
        
        def download(self):
            # download the page; on failure return an error string starting
            # with '*' so the caller can skip parsing
            try:
                retval = urlretrieve(self.url, self.file)
                return retval
            except IOError:
                retval = ('*** error: invalid url "%s"' % self.url)
                return retval
                
        def parse_and_getlink(self):
            # parse the saved HTML and collect every anchor on the page
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        
        
        
    class crawler(object):
        count = 0                                   # number of pages downloaded

        def __init__(self, url):
            self.q = [url]                          # queue of URLs still to fetch
            self.seen = []                          # URLs already processed
            self.dom = urlparse(url)[1]             # crawl stays inside this domain

        def get_page(self, url):
            r = retri(url)
            retval = r.download()
            if retval[0] == '*':                    # error string: skip parsing
                print retval, '.. skipping parse'
                return
            crawler.count += 1
            print '\n(', crawler.count, ')'
            print 'url:', url
            print 'file:', retval[0]
            self.seen.append(url)
            
            links = r.parse_and_getlink()
            for eachlink in links:
                # make relative links absolute
                if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                    eachlink = urljoin(url, eachlink)
                print '* ', eachlink

                if find(lower(eachlink), 'mailto:') != -1:
                    print '... discarded, mailto link'
                    continue

                if eachlink not in self.seen:
                    if find(eachlink, self.dom) == -1:
                        print '... discarded, not in domain'
                    else:
                        if eachlink not in self.q:
                            self.q.append(eachlink)
                            print '... new, added to q'
                        else:
                            print '... discarded, already in q'
                else:
                    print '... discarded, already processed'
            
            
            
        def go(self):
            while self.q:
                # note: list.pop() takes from the end (LIFO), so despite the
                # name q this crawl is depth-first; pop(0) would give FIFO
                url = self.q.pop()
                self.get_page(url)
                
                
    def main():
        if len(argv) > 1:
            url = argv[1]
        else:
            try:
                url = raw_input('enter starting url: ')
            except (KeyboardInterrupt, EOFError):
                url = ''
        if not url:
            return
        robot = crawler(url)
        robot.go()

    if __name__ == '__main__':
        main()
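
    The listing above runs only on Python 2: htmllib, formatter, cStringIO
    and urlparse are gone from Python 3's standard library. As a rough idea
    of what the download-and-extract-links step could look like on Python 3
    (a standard-library-only sketch, not a drop-in replacement for the
    program above; LinkParser and download_and_getlinks are names introduced
    here for illustration):

    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    class LinkParser(HTMLParser):
        # collects the href values of <a> tags, like htmllib's anchorlist
        def __init__(self):
            super().__init__()
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.anchorlist.append(value)

    def download_and_getlinks(url, filename):
        # save url to filename, then return the absolute links found in it
        urlretrieve(url, filename)
        parser = LinkParser()
        with open(filename, encoding='utf-8', errors='replace') as f:
            parser.feed(f.read())
        return [urljoin(url, link) for link in parser.anchorlist]

    For example, download_and_getlinks('http://www.example.com/', 'index.htm')
    saves the page and returns its links; the queue/seen bookkeeping from
    crawler.get_page() carries over unchanged.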
  • Original post: https://www.cnblogs.com/frog2008/p/6845306.html