Reference: http://www.freebuf.com/news/special/96763.html
Related material: http://www.jb51.net/article/65287.htm
1. Installing BeautifulSoup on Windows 7 with Python 3
BeautifulSoup documentation (Chinese): http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
BeautifulSoup download: http://www.crummy.com/software/BeautifulSoup/
Unzip the archive, open cmd, and run: python setup.py install. (Alternatively, pip install beautifulsoup4 installs it directly.)
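A quick way to check the install worked, assuming a standard Python 3 environment (the library is imported under the package name bs4):

import bs4
from bs4 import BeautifulSoup

print(bs4.__version__)                                          # installed version
print(BeautifulSoup('<p>hello</p>', 'html.parser').p.string)    # -> hello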
2. Import the library: from bs4 import BeautifulSoup
Pass your data in to build the object: soup = BeautifulSoup(data)
Then work on soup to do whatever parsing the task requires.
3. Sample code:
from bs4 import BeautifulSoup
from urllib import request
import re

web = request.urlopen('http://www.freebuf.com')
# If no parser is specified, bs4 picks what it considers the best one available,
# which may differ between environments. Without 'html.parser' it also prints a
# warning pointing out this automatic parser selection.
soup = BeautifulSoup(web.read(), 'html.parser')
tags_a = soup.find_all(name='a', attrs={'href': re.compile('^https?://')})

for tag_a in tags_a:
    print(tag_a['href'])
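If you prefer to pin a specific third-party parser instead of the built-in one, lxml is a common choice; a minimal sketch, assuming lxml has been installed separately (pip install lxml):

from bs4 import BeautifulSoup

html = '<html><body><a href="https://example.com">link</a></body></html>'
soup = BeautifulSoup(html, 'lxml')   # explicit parser; requires the lxml package
print(soup.a['href'])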
4. Using BeautifulSoup to build a sitemap of a website:
# coding:utf-8
# Crawl an entire website and collect its sitemap

import urllib.request
import urllib.error
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import time
import datetime

url = input('Enter the URL to scan: ')
domain = input('Enter the domain that URLs must contain: ')
sites = set()


# Collect all URLs found on a single page
def get_local_pages(url, domain):
    pages = set()
    global sites
    repeat_time = 0

    # Parse the incoming URL so relative paths can be joined onto it later
    parse_url = urlparse(url)

    # Guard against the request hanging: retry up to 5 times
    while True:
        try:
            print('Ready to Open the web!')
            time.sleep(1)
            print('Opening the web : %s' % url)
            web = urllib.request.urlopen(url=url, timeout=20)
            print('Success to Open the web!')
            break
        except urllib.error.URLError as e:
            print('Open Url Error:', e)
            print('Open url Failed!!!Repeat!')
            time.sleep(1)
            repeat_time += 1
            if repeat_time == 5:
                return

    soup = BeautifulSoup(web.read(), 'html.parser')
    tags = soup.find_all(name='a')

    for tag in tags:
        # Skip <a> tags that have no href attribute
        try:
            ret = tag['href']
        except KeyError:
            print('Maybe not the attr : href')
            continue

        parse_page = urlparse(ret)

        # 1. Scheme, domain and path are all empty: not a usable URL
        if parse_page[0] == '' and parse_page[1] == '' and parse_page[2] == '':
            print('Bad Page (scheme, domain and path all empty): %s' % ret)
            continue

        # 2. Scheme is present: it must be http(s)
        if parse_page[0] != '' and 'http' not in parse_page[0]:
            print('Bad Page (invalid scheme, not http): %s' % ret)
            continue

        # 3. Domain is present: it must contain the target domain
        if parse_page[1] != '' and domain not in parse_page[1]:
            print('Bad Page (domain is not %s): %s' % (domain, ret))
            continue

        # 4. Scheme empty but domain present (e.g. //caipiao.taobao.com): prepend the scheme
        if parse_page[0] == '' and parse_page[1] != '':
            print('Fix page (domain only): %s' % ret)
            newpage = parse_url[0] + ':' + ret
            if newpage not in sites:
                print('Add Fix Page (prepended scheme): %s' % newpage)
                pages.add(newpage)
            continue

        # 5. Scheme and domain empty, path present: join it onto the current page's base
        if parse_page[0] == '' and parse_page[1] == '':
            print('Fix page (path only): %s' % ret)
            temp_page = parse_url[0] + '://' + parse_url[1] + '/' + ret
            # Keep the URL clean: collapse duplicate slashes after the scheme
            newpage = temp_page[:8] + temp_page[8:].replace('//', '/')
            if newpage not in sites:
                print('Add Fix Page (joined path): %s' % newpage)
                pages.add(newpage)
            continue

        # Everything else is already an absolute, valid URL
        newpage = ret
        if newpage not in sites:
            print('Add New Page: %s' % newpage)
            pages.add(newpage)

    return pages


# DFS over the whole site (usable for small and mid-sized sites, still to be improved)
def dfs(pages, domain):
    global sites
    if not pages:
        # nothing to visit (e.g. the page failed to open)
        return

    for page in pages:
        if page not in sites:
            sites.add(page)
            get_pages = get_local_pages(page, domain)
            dfs(get_pages, domain)
    return


t1 = datetime.datetime.now()
pages = get_local_pages(url, domain)
dfs(pages, domain)
text_name = domain + '_full_site_scan.txt'
with open(text_name, 'a') as f:
    f.write(str(datetime.datetime.now()) + '\n')
    for i in sites:
        f.write(i + '\n')
    f.write('Elapsed: ' + str(datetime.datetime.now() - t1) + '\n')
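As the comment notes, the recursive dfs is only suitable for small and mid-sized sites; on a deeply linked site it can hit Python's default recursion limit. A minimal iterative sketch, assuming the same get_local_pages function and sites set from the script above, could replace it:

def crawl_iterative(start_pages, domain):
    # Depth-first traversal with an explicit stack instead of recursion
    to_visit = list(start_pages or [])
    while to_visit:
        page = to_visit.pop()
        if page in sites:
            continue
        sites.add(page)
        new_pages = get_local_pages(page, domain)  # may return None on failure
        if new_pages:
            to_visit.extend(new_pages)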
5. Basic API notes
A quick rundown of the BeautifulSoup methods used in the scripts in this post:
soup = BeautifulSoup(data)
# builds the parsed document (the "soup" object)
tags = soup.find_all(name, attrs)
(findAll is the older BS3 spelling; bs4 prefers find_all, though the alias still works.)
The two parameters of find_all worth highlighting are name and attrs:
name: the tag name; pass a tag name and every tag with that name is returned.
attrs: a dictionary of attribute constraints; the tags whose attributes match are returned.
tag.children: the direct children of tag.
tag.string: the string contained in tag, without indexing down level by level.
tag.attrs[key]: the value of the attribute named key on tag.
tag.img: the first child tag of tag whose name is img.
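A minimal sketch exercising these calls on a small hand-written snippet (the HTML below is made up purely for illustration):

from bs4 import BeautifulSoup

html = '''
<div id="card">
  <h1 class="title">Hello</h1>
  <img src="/logo.png" alt="logo">
  <a href="https://example.com">link</a>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

div = soup.find_all(name='div', attrs={'id': 'card'})[0]
print([child.name for child in div.children if child.name])  # ['h1', 'img', 'a']
print(div.h1.string)           # Hello  (first child tag named h1)
print(div.img.attrs['src'])    # /logo.png
print(div.a['href'])           # https://example.com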
6. Using BeautifulSoup to extract specific information from a 58.com listing page (Python 2.7)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib
import urllib2
from bs4 import BeautifulSoup

url = 'http://ny.58.com/zufang/24584108096437x.shtml?qq-pf-to=pcqq.c2c'

# rq = urllib2.Request(url)
# print rq
rp = urllib.urlopen(url)
html = rp.read()
soup = BeautifulSoup(html, 'html.parser')

# Title
title = soup.find_all(name='h1', attrs={'class': 'main-title font-heiti'})
for data in title:
    data_title = data.get_text()
    print data_title

# Rent
primary = soup.find_all(name='em', attrs={'class': 'house-price'})
for data in primary:
    data_primary = data.get_text()
    print data_primary

# Apartment layout
house_type = soup.find_all(name='div', attrs={'class': 'fl house-type c70'})
for data in house_type:
    temp_type = data.get_text().replace('-', ' ')
    temp_type = ' '.join(temp_type.split())
    print temp_type
# data_type_list = []
# for d in temp_type:
#     data_type_list.append(d)
# print data_type_list


# Neighbourhood
xiaoqu = soup.find_all(name='div', attrs={'class': 'fl xiaoqu c70'})
for data in xiaoqu:
    data_xiaoqu = data.get_text().strip()
    print data_xiaoqu

# Furnishings / amenities
config = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-config'})
for data in config:
    data_config = data.div.get_text().replace('-', ' ')
    data_config = ' '.join(data_config.split())
    print data_config

# Contact person
contact = soup.find_all(name='li', attrs={'class': 'house-primary-content-li clearfix person-contact'})
for data in contact:
    data_contact = data.div.span.get_text()
    print data_contact


# Write to file
# with open('58_test1.txt', 'w') as f:
#     f.write('Title: ' + data_title.decode('gbk'))
#     f.write('Rent: ' + data_primary)
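The script above targets Python 2.7 (urllib2, print statements). A minimal sketch of the same request-and-parse flow under Python 3, assuming the page structure and class names are unchanged:

from urllib import request
from bs4 import BeautifulSoup

url = 'http://ny.58.com/zufang/24584108096437x.shtml?qq-pf-to=pcqq.c2c'
html = request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

for tag in soup.find_all(name='h1', attrs={'class': 'main-title font-heiti'}):
    print(tag.get_text())            # title

for tag in soup.find_all(name='em', attrs={'class': 'house-price'}):
    print(tag.get_text())            # rent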