  • A Web Crawler Program

    Below is a simple crawler program (written for Python 2).

    #!/usr/bin/env python
    
    from sys import argv
    from os import makedirs, unlink, sep
    from os.path import dirname, exists, isdir, splitext
    from string import replace, find, lower
    #from htmllib import HTMLParser
    from urllib import urlretrieve
    from urlparse import urlparse, urljoin
    from formatter import DumbWriter, AbstractFormatter
    from cStringIO import StringIO
    from HTMLParser import HTMLParser
    '''The next three lines set the default encoding to utf8. Without this,
    Python falls back to ASCII when decoding, and any Unicode input would raise
    an error. We import sys and then reload it because the default import of sys
    removes the setdefaultencoding function, so reload is needed to bring it back.'''
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')

    class RetrieveURL(HTMLParser):      # derive a new class from HTMLParser
        def __init__(self):
            HTMLParser.__init__(self)
            self.anchorlist = []        # the only reason to override __init__: give each instance an anchorlist

        def handle_starttag(self, tag, attrs):
            # override handle_starttag so every <a> tag's href link is recorded in anchorlist
            if tag == 'a' or tag == 'A':
                for t in attrs:
                    if t[0] == 'href' or t[0] == 'HREF':
                        self.anchorlist.append(t[1])

    class Retriever(object):            # download Web pages
        def __init__(self, url):
            self.url = url
            self.file = self.filename(url)

        def filename(self, url, deffile='index.htm'):
            parsedurl = urlparse(url, 'http:', 0)   # parse path
            path = parsedurl[1] + parsedurl[2]
            ext = splitext(path)
            if ext[1] == '':            # no extension, use the default file name (e.g. https://www.baidu.com/file1)
                if path[-1] == '/':
                    path += deffile
                else:
                    path += '/' + deffile
            ldir = dirname(path)        # local directory
            if sep != '/':              # os-independent path separator
                ldir = replace(ldir, '/', sep)
            if not isdir(ldir):         # create archive dir if necessary
                if exists(ldir):
                    unlink(ldir)
                print 'ldir is ', ldir
                makedirs(ldir)
            return path

        def download(self):             # download Web page
            try:
                retval = urlretrieve(self.url, self.file)
            except IOError:
                retval = ('*** ERROR: invalid URL "%s"' % self.url,)
                return retval
            return retval

        '''def parseAndGetLinks(self):  # parse HTML, save links
            self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist'''

        def parseAndGetLinks(self):     # parse HTML with RetrieveURL, save links
            self.parser = RetrieveURL()
            self.parser.feed(open(self.file).read())
            self.parser.close()
            return self.parser.anchorlist

    class Crawler(object):              # manage entire crawling process
        count = 0                       # static downloaded page counter

        def __init__(self, url):
            self.q = [url]
            self.seen = []
            self.dom = urlparse(url)[1]

        def getPage(self, url):
            r = Retriever(url)
            retval = r.download()
            if retval[0] == '*':        # error situation, do not parse
                print retval, '... skipping parse'
                return
            Crawler.count += 1
            print ' (', Crawler.count, ')'
            print 'URL:', url
            print 'FILE:', retval[0]
            self.seen.append(url)

            links = r.parseAndGetLinks()    # get and process links
            for eachLink in links:
                if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                    eachLink = urljoin(url, eachLink)
                print '* ', eachLink,

                if find(lower(eachLink), 'mailto:') != -1:
                    print '... discarded, mailto link'
                    continue

                if eachLink not in self.seen:
                    if find(eachLink, self.dom) == -1:
                        print '... discarded, not in domain'
                    else:
                        if eachLink not in self.q:
                            self.q.append(eachLink)
                            print '... new, added to Q'
                        else:
                            print '... discarded, already in Q'
                else:
                    print '... discarded, already processed'

        def go(self):                   # process links in queue
            while self.q:
                url = self.q.pop()
                self.getPage(url)

    def main():
        if len(argv) > 1:
            url = argv[1]
        else:
            try:
                url = raw_input('Enter starting URL: ')
            except (KeyboardInterrupt, EOFError):
                url = ''
        if not url:
            return
        robot = Crawler(url)
        robot.go()

    if __name__ == '__main__':
        main()
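
    The program above is written for Python 2 (print statements, the urllib / urlparse / HTMLParser
    module names, and reload(sys)). For reference, below is a minimal sketch of the same
    link-extraction idea on Python 3, using html.parser and urllib from the standard library.
    The names LinkCollector and get_links are my own illustrations, not part of the original post.

    # Minimal Python 3 sketch (assumed names, not from the original program):
    # subclass html.parser.HTMLParser to collect href values from <a> tags.
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen

    class LinkCollector(HTMLParser):
        def __init__(self):
            super().__init__()
            self.anchorlist = []            # collected href values

        def handle_starttag(self, tag, attrs):
            if tag == 'a':                  # html.parser lower-cases tag names
                for name, value in attrs:
                    if name == 'href' and value:
                        self.anchorlist.append(value)

    def get_links(url):
        # download the page, parse it, and resolve relative links against the page URL
        html = urlopen(url).read().decode('utf-8', errors='replace')
        parser = LinkCollector()
        parser.feed(html)
        parser.close()
        return [urljoin(url, link) for link in parser.anchorlist]

    if __name__ == '__main__':
        for link in get_links('https://www.example.com/'):
            print(link)

    This sketch only collects and resolves the links of a single page; the queue and
    seen-list bookkeeping done by the Crawler class above would still be needed for a full crawl.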

      

  • Original article: https://www.cnblogs.com/kramer/p/3766090.html