zoukankan      html  css  js  c++  java
  • Python3 调用 BeautifulSoup 进行简单的网页处理

    Python3 调用 BeautifulSoup 进行简单的网页处理

    from bs4 import BeautifulSoup

    # Pitfall: the encoding passed to open() must match how index.html was
    # actually saved (presumably UTF-16 LE here -- confirm for your own file).
    # The with-block guarantees the file handle is closed once parsing is done.
    with open('index.html', 'r', encoding='utf-16-le') as file:
        soup = BeautifulSoup(file, 'lxml')

    print(soup)  # dump the parsed document
    print(' ------------- ')
    print(soup.get_text())  # text content of all tags, markup removed
    print(' ------------- ')
    print(soup.prettify())  # pretty-printed (indented) markup
    # Print elements as tag objects: the <title>, the <body>, and then the
    # <div> reached through the body.
    separator = ' ------------- '
    for tag in (soup.title, soup.body):
        print(tag)
        print(separator)
    print(soup.body.div)
    import re

    print(soup.find_all('br'))  # search by exact tag name
    print(' ------------- ')
    # A compiled regular expression is matched against tag names: this
    # selects every tag whose name starts with "b".
    print(soup.find_all(re.compile('^b')))
    print(' ------------- ')
    print(soup.find_all(id='wiz_custom_css'))  # filter on the id attribute
    print(' ------------- ')
    # .strings yields every text node in the document (all descendants, not
    # only the next level); .stripped_strings would also trim whitespace.
    for strr in soup.strings:
        print(strr)
    print(' ------------- ')
    # Strip the markup from <body> and save the remaining text to a Markdown
    # file named after the page title (still a work in progress).

    # Remove every <script> and <style> element so their contents do not end
    # up in the extracted text.
    for script in soup(["script", "style"]):
        script.extract()  # detach the current tag from the tree

    # soup.title is None when the page has no <title>; fall back to an empty
    # title instead of raising AttributeError. Surrounding whitespace is
    # stripped so it does not leak into the output file name.
    title_text = soup.title.get_text().strip() if soup.title is not None else ''

    # Join every text node under <body>, each followed by a single space
    # (.stripped_strings could be used instead to trim whitespace).
    str_text = ''.join(strr + ' ' for strr in soup.body.strings)
    print(str_text)

    # Use index.md when there is no usable title. The with-block closes (and
    # therefore flushes) the file; the explicit encoding keeps non-ASCII text
    # from depending on the platform default codec.
    md_name = (title_text or 'index') + '.md'
    with open(md_name, 'w', encoding='utf-8') as md_file:
        md_file.write(str_text)
    # Approach found online. Note that <br/> tags are not turned into line
    # breaks here; a different approach is needed for that.

    # Drop all <script> and <style> elements before extracting text.
    for tag in soup(["script", "style"]):
        tag.extract()  # detach the current tag from the tree

    raw_text = soup.get_text()

    # Walk the text line by line (splitlines() splits on line boundaries),
    # trim each line (strip() removes leading/trailing whitespace by default),
    # split it on single spaces, and keep only the non-empty trimmed pieces.
    pieces = []
    for raw_line in raw_text.splitlines():
        for phrase in raw_line.strip().split(" "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)

    # Re-join the surviving pieces with single spaces. The result could be
    # written to a file (e.g. aa.md) instead of being printed.
    text = ' '.join(pieces)
    print(text)




  • 相关阅读:
    python的特点
    epoll理解(转)
    数据库存储过程、触发器、连接
    Mysql的四种隔离级别
    linux指令
    利用asyncio(支持异步io)和协程实现单线程同步
    ubuntu安装codeblocks
    临界区与互斥量区别
    单链表的简单操作
    hdu 5475 An easy problem(暴力 || 线段树区间单点更新)
  • 原文地址:https://www.cnblogs.com/fly2wind/p/6872468.html
Copyright © 2011-2022 走看看