  • Python Web Crawler

    A simple recursive crawler: it fetches a page, saves it to the craw directory if the page matches any of the given keywords (or unconditionally if no keywords are given), then follows the .htm/.html links it finds, down to a given depth.

    import re
    import os
    import urllib.request as lib

    def craw_links(url, depth, keyword, processed):
        '''url: the URL to crawl
           depth: the remaining crawl depth
           keyword: the tuple of keywords to filter on
           processed: list of URLs that have already been crawled
        '''
        if url.startswith(('http://', 'https://')):
            if url not in processed:
                # mark this URL as processed
                processed.append(url)
            else:
                # avoid processing the same URL again
                return
            print('Crawling ' + url + '...')
            fp = lib.urlopen(url)
            # Python 3 returns bytes, so decode before matching
            contents = fp.read()
            contents_decoded = contents.decode('UTF-8')
            fp.close()
            pattern = '|'.join(keyword)
            # if this page contains any of the keywords, save it to a file
            flag = False
            searched = None
            if pattern:
                searched = re.search(pattern, contents_decoded)
            else:
                # if no keywords are given, save every page
                flag = True
            if flag or searched:
                name = url.replace(':', '_').replace('/', '_')
                with open(os.path.join('craw', name), 'wb') as fp:
                    fp.write(contents)
            # find all the links in the current page
            links = re.findall('href="(.*?)"', contents_decoded)
            # crawl all links found in the current page
            for link in links:
                # resolve relative paths against the current URL
                if not link.startswith(('http://', 'https://')):
                    try:
                        index = url.rindex('/')
                        link = url[0:index + 1] + link
                    except ValueError:
                        pass
                if depth > 0 and link.endswith(('.htm', '.html')):
                    craw_links(link, depth - 1, keyword, processed)

    if __name__ == '__main__':
        processed = []
        keywords = ('KeyWord1', 'KeyWord2')
        if not os.path.isdir('craw'):
            os.mkdir('craw')
        craw_links('http://docs.python.org/3/library/index.html', 1, keywords, processed)
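
    The original script imported multiprocessing and its docstring mentioned a process pool, though neither was actually used, so a parallel variant was apparently intended. Below is a minimal sketch of one way to fan seed URLs out over a multiprocessing.Pool; the craw_entry helper, the pool size, and the Manager-backed processed list are assumptions introduced here for illustration, not part of the original code.

    import multiprocessing

    def craw_entry(args):
        # hypothetical helper: unpack one (url, depth, keywords, processed)
        # tuple so it can be dispatched through pool.map
        url, depth, keywords, processed = args
        craw_links(url, depth, keywords, processed)

    if __name__ == '__main__':
        manager = multiprocessing.Manager()
        processed = manager.list()  # process-safe "already crawled" list
        keywords = ('KeyWord1', 'KeyWord2')
        seeds = ['http://docs.python.org/3/library/index.html']
        with multiprocessing.Pool(processes=4) as pool:  # pool size is an arbitrary choice
            pool.map(craw_entry, [(u, 1, keywords, processed) for u in seeds])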
  • Original post: https://www.cnblogs.com/cmnz/p/7096607.html