  • Python crawler

    The crawler downloads pages level by level, following the links found on each page down to the n-th level.
    Main program flow (a minimal sketch of this loop follows below):
    start the crawler
    create a queue
    loop; exit when the queue is empty
    pop a URL off the queue
    download the page and collect the links to the next level
    append those links to the queue
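
    A minimal sketch of that loop, for orientation only (this is not the original code; fetch_links(url) is a hypothetical helper that downloads one page and returns the links found on it; the full implementation follows):

    from collections import deque

    def crawl(start_url, fetch_links):
        queue = deque([start_url])            # URLs waiting to be processed
        seen = set()                          # URLs already processed
        while queue:                          # exit when the queue is empty
            url = queue.popleft()             # pop the next URL off the queue
            if url in seen:
                continue
            seen.add(url)
            for link in fetch_links(url):     # download the page, find next-level links
                if link not in seen:
                    queue.append(link)        # append those links to the queue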

    # Python 2 standard library (htmllib, urlparse, cStringIO and the string
    # functions below no longer exist under these names in Python 3)
    from sys import argv
    from os import makedirs, unlink, sep
    from os.path import dirname, exists, isdir, splitext
    from string import replace, find, lower
    from htmllib import HTMLParser
    from urllib import urlretrieve
    from urlparse import urlparse, urljoin
    from formatter import DumbWriter, AbstractFormatter
    from cStringIO import StringIO
    import os, sys
    
    syspath=sys.argv[0]
    
    
    class retri(object):
        """Download a single URL to a local file and extract its links."""
        def __init__(self, url):
            self.url = url
            self.file = self.filename(url)
            
        def filename(self, url, deffile='index.htm'):
            # map a URL to a local file path, creating directories as needed
            parsedurl = urlparse(url, 'http:', 0)
            if parsedurl[2] == '':                  # no path component
                path = parsedurl[1] + '/index.htm'
            else:
                path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':                        # no file extension: use the default name
                if path[-1] == '/':
                    path += deffile
                else:
                    path += '/' + deffile
            ldir = dirname(path)
    #        ldir=path
            if sep != '/':                          # on Windows, switch to native separators
                ldir = replace(ldir, '/', sep)
            if not isdir(ldir):                     # make sure the target directory exists
                if exists(ldir):
                    unlink(ldir)
                makedirs(ldir)
            return path
    #        return parsedurl[2]
    #        return parsedurl[2]
        
        
        def download(self):
            # fetch self.url into self.file; on failure return an error string
            try:
                retval = urlretrieve(self.url, self.file)
                return retval
            except IOError:
                retval = '*** error: invalid url "%s"' % self.url
                return retval
                
        def parse_and_getlink(self):
            # parse the downloaded file and return the list of anchor targets
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist
        
        
        
    class crawler(object):
        """Manage the queue of URLs to visit, staying within the start domain."""
        count = 0                            # number of pages downloaded
        def __init__(self, url):
            self.q = [url]                   # queue of URLs still to fetch
            self.seen = []                   # URLs already processed
            self.dom = urlparse(url)[1]      # restrict the crawl to this domain
        
        def get_page(self, url):
            r = retri(url)
            retval = r.download()
            if retval[0] == '*':             # download failed
                print retval, '.. skipping parse'
                return
            crawler.count += 1
            print '\n(', crawler.count, ')'
            print 'url:', url
            print 'file:', retval[0]
            self.seen.append(url)
            
            links = r.parse_and_getlink()
            for eachlink in links:
                # make relative links absolute
                if eachlink[:4] != 'http' and find(eachlink, '://') == -1:
                    eachlink = urljoin(url, eachlink)
                print '* ', eachlink

                if find(lower(eachlink), 'mailto:') != -1:
                    print '... discarded, mailto link'
                    continue

                if eachlink not in self.seen:
                    if find(eachlink, self.dom) == -1:
                        print '... discarded, not in domain'
                    else:
                        if eachlink not in self.q:
                            self.q.append(eachlink)
                            print '... new, added to q'
                        else:
                            print '... discarded, already in q'

                else:
                    print '... discarded, already processed'
            
            
            
        def go(self):
            # process the queue until it is empty
            while self.q:
                url = self.q.pop()           # take the most recently added URL
                self.get_page(url)
                
                
    def main():
        if len(argv) > 1:
            url = argv[1]
        else:
            try:
                url = raw_input('enter starting url: ')
            except (KeyboardInterrupt, EOFError):
                url = ''
        if not url:
            return
        robot = crawler(url)
        robot.go()

    if __name__ == '__main__':
        main()
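
    The listing above runs only on Python 2: htmllib, urlparse, cStringIO and the formatter module were removed or reorganized in Python 3. As a hedged sketch (not the original author's code), the link-extraction step could be rewritten for Python 3 with html.parser and urllib.request; the class name LinkParser and the helper get_links are illustrative assumptions:

    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen

    class LinkParser(HTMLParser):
        """Collect the href of every <a> tag, mirroring htmllib's anchorlist."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.anchorlist.append(value)

    def get_links(url):
        # download the page and return absolute links to the next level
        html = urlopen(url).read().decode('utf-8', 'replace')
        parser = LinkParser()
        parser.feed(html)
        parser.close()
        return [urljoin(url, link) for link in parser.anchorlist]

    Plugging a helper like this into the queue loop sketched at the top reproduces the crawl on Python 3 without the removed modules.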
            
            
        
        
  • Original post: https://www.cnblogs.com/frog2008/p/6845306.html