zoukankan html css js c++ java

Python_网页爬虫

 1 import sys
 2 import multiprocessing
 3 import re
 4 import os
 5 import urllib.request as lib
 6 
 7 def craw_links( url,depth,keyword,processed):
 8     ''' url:the url to craw
 9         deth:the current depth to craw
10         keyword:the tuple of keywords to focus
11         pool:process pool
12     '''
13 
14     contents=[]
15     if url.startswith(('htpp://','https://')):
16         if url not in processed:
17             #mark this url as processed
18             processed.append(url)
19         else:
20             #avoid prossing the same url again
21             return
22         print('Crawing '+url+'...')
23         fp = lib.urlopen(url)
24         #python3 returns bytes,so need to decode
25         contents = fp.read()
26         contents_decoded = contents.decode('UTF-8')
27         fp.close()
28         pattern = '|'.join(keyword)
29         #if this page contains certain keywords,save it to a file
30         flag = False
31         if pattern:
32             searched = re.search(pattern,contents_decoded)
33         else:
34             #if the keywords to filter is not given,save current page
35             flag = True
36         if flag or searched:
37             with open('craw\'+url.replace(':','_').replace('/','_'),'wb')  as fp:
38                 fp.write(contents)
39         #find all the links in the current page
40         links = re.findall('href="(.*?)"',contents_decoded)
41         #craw all links in the current page
42         for link in links:
43             #consider the relative path
44             if not link.startswith(('http://','https://')):
45                 try:
46                     index=url.rindex('/')
47                     link = url[0:index+1]+link
48                 except:
49                     pass
50             if depth>0 and link.endswith(('.htm','.html')):
51                 craw_links(link,depth-1,keyword,processed)
52 
if __name__ == '__main__':
    # URLs already visited; shared with every recursive craw_links call.
    processed = []
    # Pages containing any of these keywords are saved to the output directory.
    keywords = ('KeyWord1', 'KeyWord2')
    # Create the output directory only when it is missing. The previous check
    # (`exists or not isdir`) called mkdir on an existing directory and raised
    # FileExistsError on every run after the first.
    os.makedirs('craw', exist_ok=True)
    craw_links(r'http://docs.python.org/3/library/index.html', 1, keywords, processed)

查看全文

相关阅读:
[LeetCode 1029] Two City Scheduling
POJ 2342 Anniversary party （树形DP入门）
Nowcoder 106 C.Professional Manager(统计并查集的个数)
2018 GDCPC 省赛总结
 CF 977 F. Consecutive Subsequence
Uva 12325 Zombie's Treasure Chest （贪心，分类讨论）
Poj 2337 Catenyms（有向图DFS求欧拉通路）
POJ 1236 Network of Schools (强连通分量缩点求度数)
POJ 1144 Network （求割点）
POJ 3310 Caterpillar（图的度的判定）

原文地址：https://www.cnblogs.com/cmnz/p/7096607.html