网页爬虫常用来在互联网上爬取感兴趣的页面或文件,结合数据处理与分析技术可以得到更深层次的信息。下面的代码实现了网页爬虫,可以抓取指定网页中的所有链接,并且可以指定关键字和抓取深度。
import multiprocessing
import os
import re
import sys
import urllib.request as lib
from urllib.parse import urljoin
6
def craw_links(url, depth, keywords, processed):
    """Crawl ``url`` and, recursively, the HTML pages it links to.

    A fetched page is saved under the ``craw`` directory when its text
    matches the keyword pattern. The file is named after the URL with
    ``:`` and ``/`` replaced by ``_``.

    :param url: address of the page to fetch; anything that is not an
        ``http://`` or ``https://`` URL is ignored.
    :param depth: how many further levels of links to follow from this
        page; ``0`` fetches only ``url`` itself.
    :param keywords: iterable of strings, joined with ``|`` and used as a
        regular expression against the page text. When the joined pattern
        is empty, every fetched page is saved.
    :param processed: list of URLs already visited; it is updated in place
        so that no page is fetched twice.
    :return: None
    """
    if not url.startswith(('http://', 'https://')):
        return
    if url in processed:
        # Avoid processing the same URL again.
        return
    # Mark this URL as processed before fetching, so a page that links
    # back to itself cannot trigger a second fetch.
    processed.append(url)

    print('Crawing ' + url + '...')
    try:
        with lib.urlopen(url) as response:
            # urlopen yields bytes, so decode before searching.
            page_text = response.read().decode('utf-8')
    except (OSError, ValueError) as err:
        # URLError/HTTPError derive from OSError and UnicodeDecodeError
        # from ValueError. One unreachable or non-UTF-8 page should not
        # abort the rest of the crawl.
        print('Skipping ' + url + ': ' + str(err))
        return

    # Save the page when no keyword filter is given or when it matches.
    pattern = '|'.join(keywords)
    if not pattern or re.search(pattern, page_text):
        os.makedirs('craw', exist_ok=True)
        file_name = url.replace(':', '_').replace('/', '_')
        target = os.path.join('craw', file_name)
        # Write with the same encoding the page was decoded with, so the
        # locale default cannot reject characters.
        with open(target, 'w', encoding='utf-8') as out:
            out.write(page_text)

    if depth <= 0:
        return

    # Follow every link in the current page that points at an HTML file.
    for link in re.findall('href="(.*?)"', page_text):
        # urljoin leaves absolute links alone and resolves relative ones
        # (including root-relative and ../ forms) against the page URL.
        link = urljoin(url, link)
        if link.endswith(('.htm', '.html')):
            craw_links(link, depth - 1, keywords, processed)
61
if __name__ == '__main__':
    # Keyword filter and the shared list of already-visited URLs.
    keywords = ('datetime', 'KeyWord2')
    processed = []

    # Make sure the 'craw' output directory exists before crawling.
    if not os.path.isdir('craw'):
        os.mkdir('craw')

    # Crawl the start page and follow its links one level deep.
    start_url = r'https://docs.python.org/3/library/index.html'
    craw_links(start_url, 1, keywords, processed)
67 craw_links(r'https://docs.python.org/3/library/index.html',1,keywords,processed)