zoukankan html css js c++ java

Python3 调用 BeautifulSoup 进行简单的网页处理

from bs4 import BeautifulSoup
# Parse the saved page.  The original author flagged the encoding argument as
# a pitfall: it has to match how index.html was actually saved (presumably
# UTF-16 LE for this export -- confirm for your own file).
# BeautifulSoup reads the whole file object inside its constructor, so the
# handle can be closed as soon as the soup has been built.
with open('index.html', 'r', encoding='utf-16-le') as file:
    soup = BeautifulSoup(file, 'lxml')
print(soup)  # dump the parsed document
print('\n ------------- \n')
print(soup.get_text())  # text content of every tag
print('\n ------------- \n')
print(soup.prettify())  # pretty-printed markup

# Access elements as attributes of the soup
print(soup.title)
print('\n ------------- \n')
print(soup.body)
print('\n ------------- \n')
print(soup.body.div)

import re
print(soup.find_all('br'))  # search by tag name only
print('\n ------------- \n')
# A compiled regular expression is matched against tag names:
# here, every tag whose name starts with "b".
print(soup.find_all(re.compile('^b')))
print('\n ------------- \n')
print(soup.find_all(id='wiz_custom_css'))  # search by id attribute
print('\n ------------- \n')
# .strings yields every string in the document;
# .stripped_strings can be used instead to drop the surrounding whitespace.
for strr in soup.strings:
    print(strr)
print('\n ------------- \n')

# Strip the markup from <body> and save the remaining text to a Markdown file
# (the original author marked this section as "to be improved").
# Remove all <script> and <style> elements so their contents are not exported.
for script in soup(["script", "style"]):
    script.extract()  # detach the current tag from the tree
# soup.title is None when the page has no <title>; treat that as an empty title.
title_text = soup.title.get_text() if soup.title is not None else ''
# One body string per line (.stripped_strings could be used to drop whitespace).
str_text = ''.join(strr + '\n' for strr in soup.body.strings)
print(str_text)
# Name the output after the page title, falling back to index.md without one.
md_name = title_text + '.md' if title_text else 'index.md'
# Explicit UTF-8 keeps non-ASCII text writable regardless of the platform's
# default encoding; the with-block guarantees the file is flushed and closed.
with open(md_name, 'w', encoding='utf-8') as md_file:
    md_file.write(str_text)

# Approach found online.  <br/> tags are not converted into line breaks here;
# the original author notes that another approach is covered later.
# Remove all <script> and <style> elements before extracting text.
for script in soup(["script", "style"]):
    script.extract()  # detach the current tag from the tree
# Get the text of the whole document.
text = soup.get_text()
# Break into lines and remove leading and trailing whitespace on each.
# splitlines() splits on line boundaries such as \r, \n and \r\n;
# strip() with no argument removes surrounding whitespace.
lines = (line.strip() for line in text.splitlines())
# Break multi-headlines (separated by two spaces) into a line each.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop blank lines: the generator keeps only non-empty chunks before joining.
text = '\n'.join(chunk for chunk in chunks if chunk)
# To save the result instead of printing it, write `text` to a file,
# e.g. aa.md, inside a `with open(...)` block.
print(text)

From WizNote

查看全文

相关阅读:
数组对象---数据存储
 运行vue项目时，无法自动打开页面怎么办？
数组扁平化
 数组去重
 CSS-----------text-transform
CSS3-----width：max-content,min-content和fit_content属性
 可迭代对象
 bit和byte的区别？
前端常见跨域问题？
HackerRank "Lucky Numbers"

原文地址：https://www.cnblogs.com/fly2wind/p/6872468.html