  • The web crawler example from Core Python Programming

#!/usr/bin/env python

import cStringIO                    # provides an in-memory file object used by the HTML formatter
import formatter                    # AbstractFormatter/DumbWriter classes required by HTMLParser
from htmllib import HTMLParser      # We use various classes in these modules for parsing HTML.
import httplib                      # We only need an exception from this module
import os                           # This provides various file system functions
import sys                          # We are just using argv for command-line arguments
import urllib                       # We only need the urlretrieve() function for downloading Web pages
import urlparse                     # We use the urlparse() and urljoin() functions for URL manipulation

class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)                     # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]   # 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)             # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:            # the URL path has no file extension
            filepath = os.path.join(filepath, default)      # 'www.baidu.com\index.html'
        linkdir = os.path.dirname(filepath)                 # 'www.baidu.com'
        if not os.path.isdir(linkdir):                      # the local directory does not exist yet
            if os.path.exists(linkdir):                     # a plain file with that name is in the way
                os.unlink(linkdir)
            os.makedirs(linkdir)                            # create the local directory for this host
        return url, filepath

    def download(self):
        'Download URL to specific named file'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist

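# (Illustrative only; the example URL below is an assumption, not from the original post.)
# get_file() maps a URL onto a local path under a directory named after the host,
# appending 'index.html' when the URL path has no file extension, e.g.
#   'http://www.example.com/docs/'  ->  'www.example.com/docs/index.html'
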
class Crawler(object):
    count = 0                                            # the number of pages downloaded from the internet

    def __init__(self, url):
        self.q = [url]                                   # a queue of links to download
        self.seen = set()                                # a set of all the links that we have seen (downloaded) already
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])        # 'baidu.com'

    def get_page(self, url, media=False):
        'Download page & parse links, add to queue if necessary'
        r = Retriever(url)
        fname = r.download()[0]                          # 'www.baidu.com\index.html'
        if fname[0] == '*':                              # a leading '*' marks a download error message
            print fname, '... skipping parse'
            return
        Crawler.count += 1                               # 1
        print '\n(', Crawler.count, ')'                  # ( 1 )
        print 'URL:', url                                # URL: http://www.baidu.com
        print 'FILE:', fname                             # FILE: www.baidu.com\index.html
        self.seen.add(url)                               # set(['http://www.baidu.com'])
        ftype = os.path.splitext(fname)[1]               # '.html'
        if ftype not in ('.htm', '.html'):               # only parse HTML files for further links
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):               # skip e-mail links
                print '... discarded, mailto link'
                continue
            if not media:                                # skip media files unless explicitly requested
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):           # resolve relative links against the current page
                link = urlparse.urljoin(url, link)
            print '*', link,
            if link not in self.seen:                    # not downloaded yet
                if self.dom not in link:                 # stay within the starting domain
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

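    # (Illustrative only; these URLs are made up for demonstration.)
    # The urljoin() call above resolves relative links against the page they
    # were found on, e.g.
    #   urlparse.urljoin('http://www.example.com/a/index.html', 'b.html')
    #   -> 'http://www.example.com/a/b.html'
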
    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
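
A minimal usage sketch (Python 2), assuming the listing above is saved as crawl.py; the module name and the starting URL here are assumptions for illustration, not part of the original post:

    # drive the crawler from another script instead of the command line
    from crawl import Crawler

    robot = Crawler('http://www.example.com/')
    robot.go()          # downloads pages within example.com into ./www.example.com/

The script can also be run directly, e.g. python crawl.py http://www.example.com/, which is exactly what main() does with the first command-line argument.
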
  • Original post: https://www.cnblogs.com/femaleprogramer/p/3848686.html