zoukankan      html  css  js  c++  java
  • Python通用网络爬虫脚本

      1 from sys import argv
      2 from os import makedirs,unlink,sep,mkdir
      3 from os.path import dirname,exists,isdir,splitext
      4 from string import replace,find,lower
      5 from htmllib import HTMLParser
      6 from urllib import urlretrieve
      7 from urlparse import urlparse,urljoin
      8 from formatter import DumbWriter,AbstractFormatter
      9 from cStringIO import StringIO
     10 
     11 
     12 class Retriever(object):
     13     def __init__(self,url):
     14         self.url = url
     15         self.file = 'E:installPython27\' + self.filename(url)
     16 
     17     def filename(self,url,deffile='index.htm'):
     18         parsedurl = urlparse(url,'http:',0)
     19         path = parsedurl[1] + parsedurl[2]
     20         ext = splitext(path) # seperate ext name
     21         if ext[1] == '':
     22             if path[-1] == '/':
     23                 path += deffile
     24             else:
     25                 path += '/' + deffile
     26 
     27         ldir = dirname(path)
     28         if sep != '/':
     29             ldir = replace(ldir,'/',sep)
     30         if not isdir(ldir):
     31             if exists(ldir): unlink(ldir)
     32             makedirs(ldir)
     33         return path
     34 
     35     def download(self):
     36         try:
     37             retval = urlretrieve(self.url,self.file)
     38         except IOError:
     39             retval = ('*** ERROR: invalid URL "%s"' %
     40                 self.url)
     41         return retval
     42 
     43     def parseAndGetLinks(self):
     44         self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     45         self.parser.feed(open(self.file).read())
     46         self.parser.close()
     47         return self.parser.anchorlist
     48 
     49 class Crawler(object):
     50     count = 0 # static downloaded page counter
     51 
     52     def __init__(self,url):
     53         self.q = [url]
     54         self.seen = []
     55         self.dom = urlparse(url)[1]
     56 
     57     def getPage(self,url):
     58         r = Retriever(url)
     59         retval = r.download()
     60         if retval[0] == '*':
     61             print retval,'...skipping parse'
     62             return
     63         Crawler.count += 1
     64         print '
    (',Crawler.count,')'
     65         print 'URL:',url
     66         print 'FILE:',retval[0]
     67         self.seen.append(url)
     68 
     69 
     70 
     71         links = r.parseAndGetLinks()
     72         for eachLink in links:
     73             if eachLink[:4] != 'http' and find(eachLink,'://') == -1:
     74                 eachLink = urljoin(url,eachLink)
     75 
     76             if find(lower(eachLink),'mailto:') != -1:
     77                 print '...discarded,mailto link'
     78                 continue
     79             if eachLink not in self.seen:
     80                 if find(eachLink,self.dom) == -1:
     81                     print '...discarded,not in domain'
     82                 else:
     83                     if eachLink not in self.q:
     84                         self.q.append(eachLink)
     85                         print '...new,added to Q'
     86                     else:
     87                         print '...discarded,already in Q'
     88             else:
     89                 print '...discarded,already processed'
     90 
     91 
     92 
     93     def go(self):#process links in queue
     94         while self.q:
     95             url = self.q.pop()
     96             self.getPage(url)
     97 
     98 
     99 
    100 def main():
    101     if len(argv) > 1:
    102         url = argv[1]
    103 
    104     else:
    105         try:
    106             url = raw_input('Enter starting URL:')
    107         except(KeyboardInerrupt,EOFError):
    108             url = ''
    109         if not url: return
    110         robot = Crawler(url)
    111         robot.go()
    112 
    113 if __name__ == '__main__':
    114     main()
  • 相关阅读:
    全民医疗
    SpringMVC
    Mybatis 缓存策略
    不要追涨杀跌
    我只认比特币
    ETH反思
    世界是熵增的
    切片最好还是传引用
    rxgo示例
    11月份的计划
  • 原文地址:https://www.cnblogs.com/elliottc/p/4947983.html
Copyright © 2011-2022 走看看