  • Basic usage of BeautifulSoup

    # -*- coding:UTF-8 -*-
    from urllib import request
    from bs4 import BeautifulSoup
    import re
    import sys

    if __name__ == "__main__":
        # create the output txt file
        file = open('一念永恒.txt', 'w', encoding='utf-8')
        # URL of the novel's table of contents
        target_url = 'http://www.biqukan.com/1_1094/'
        # User-Agent
        head = {}
        head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
        target_req = request.Request(url = target_url, headers = head)
        target_response = request.urlopen(target_req)
        target_html = target_response.read().decode('gbk', 'ignore')
        # create the BeautifulSoup object
        listmain_soup = BeautifulSoup(target_html, 'lxml')

        # search the parse tree for the div tags whose class is "listmain"
        chapters = listmain_soup.find_all('div', class_ = 'listmain')
        # build a new BeautifulSoup object from the query result and keep parsing it
        download_soup = BeautifulSoup(str(chapters), 'lxml')
        # count the chapters (dl.contents alternates newline text nodes and tags, hence the /2)
        numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
        index = 1
        # flag marking where to start recording: keep only the links under the main-text volume, skip the "latest chapters" list
        begin_flag = False
        # iterate over every child node of the dl tag
        for child in download_soup.dl.children:
            # skip newline nodes
            if child != '\n':
                # once the 《一念永恒》正文卷 heading is found, set the flag
                if child.string == u"《一念永恒》正文卷":
                    begin_flag = True
                # follow the chapter link and download its content
                if begin_flag == True and child.a != None:
                    download_url = "http://www.biqukan.com" + child.a.get('href')
                    download_req = request.Request(url = download_url, headers = head)
                    download_response = request.urlopen(download_req)
                    download_html = download_response.read().decode('gbk', 'ignore')
                    download_name = child.string
                    soup_texts = BeautifulSoup(download_html, 'lxml')
                    texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
                    soup_text = BeautifulSoup(str(texts), 'lxml')
                    write_flag = True
                    file.write(download_name + '\n\n')
                    # write the scraped text to the file
                    for each in soup_text.div.text.replace('\xa0', ''):
                        if each == 'h':
                            write_flag = False
                        if write_flag == True and each != ' ':
                            file.write(each)
                        if write_flag == True and each == '\r':
                            file.write('\n')
                    file.write('\n\n')
                    # print download progress
                    sys.stdout.write("已下载:%.3f%%" % float(index/numbers) + '\r')
                    sys.stdout.flush()
                    index += 1
        file.close()
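    A side note on the double parse above: the script converts the find_all() result back to a string and re-parses it with a second BeautifulSoup object. The same chapter list can be walked directly from the first soup; a minimal sketch under the same page-structure assumptions (a listmain div containing a dl of chapter links; the shortened User-Agent is only a placeholder):

    from urllib import request
    from bs4 import BeautifulSoup

    head = {'User-Agent': 'Mozilla/5.0'}                      # placeholder UA string
    req = request.Request(url='http://www.biqukan.com/1_1094/', headers=head)
    html = request.urlopen(req).read().decode('gbk', 'ignore')

    soup = BeautifulSoup(html, 'lxml')
    listmain = soup.find('div', class_='listmain')            # navigate the first match directly
    for a in listmain.dl.find_all('a'):                       # no str() round-trip / second parse needed
        print(a.string, 'http://www.biqukan.com' + a.get('href'))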

    >>> for link in soup.find_all('a'):
    ...     print(link.get('href'))
    # used to extract the links from the <a> tags
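
    Putting the pieces together, here is a minimal, self-contained sketch of just the BeautifulSoup basics the scripts on this page rely on (the HTML snippet and the hrefs in it are made up for illustration):

    from bs4 import BeautifulSoup

    html = '''
    <div class="listmain">
      <dl>
        <dt>《一念永恒》正文卷</dt>
        <dd><a href="/1_1094/chapter1.html">第一章</a></dd>
        <dd><a href="/1_1094/chapter2.html">第二章</a></dd>
      </dl>
    </div>
    '''

    soup = BeautifulSoup(html, 'lxml')                # parse the document ('html.parser' works too)
    div = soup.find('div', class_='listmain')         # first tag matching the filter
    links = soup.find_all('a')                        # every matching tag, returned as a list
    same_links = soup.select('div.listmain dd > a')   # the same query with a CSS selector

    for a in links:
        print(a.get('href'), a.string)                # attribute access and the tag's text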

      

    Beautiful Soup 4.4.0 documentation (Chinese): http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

    The following article comes from a cnblogs author: http://www.cnblogs.com/sakura3/p/8460224.html (reposted here to make review easier, thanks).

    Scraping a novel:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    import requests
    from bs4 import BeautifulSoup

    # get_url_list: collect the URLs of all chapters into one list
    def get_url_list(url):
        content = requests.get(url).content           # fetch the page content
        soup = BeautifulSoup(content, 'lxml')         # instantiate a BeautifulSoup object
        url_list = []                                 # empty list of chapter URLs
        # urls = soup.find('div', {'id': 'list'}).find('dl').find_all('dd')
        urls = soup.select('#list > dl > dd > a')     # pick the links with a CSS selector; the find() chain above works too
        for i in urls:                                # iterate over each chapter link
            i = i.get('href')                         # get the href
            # print(i)
            i = 'http://www.biquge.com.tw' + i        # prepend the site root to form the final chapter URL
            url_list.append(i)                        # append it to url_list
        # print(url_list)
        return url_list

    # download the content of one chapter
    def get_data(url):
        content = requests.get(url).content
        soup = BeautifulSoup(content, 'lxml')
        # without encoding='utf-8' the write below raises an encoding error
        f = open(r'C:\Users\HBX\Documents\staudy\HMXX.txt', 'a+', encoding='utf-8')
        text_name = soup.find('div', {'class': 'bookname'}).find('h1').text   # chapter title
        # text_content = soup.select('#content')
        text_content = soup.find('div', {'id': 'content'}).get_text()   # chapter body; a CSS select() would work as well
        book = text_name + '\n' + text_content   # one whole chapter
        # print(book)
        f.write(book + '\n')   # write the chapter followed by a newline
        f.close()              # close the file
        # for x in text_content:
        #     a = x.text.replace('readx();', '')
        #     print(a)


    if __name__ == '__main__':
        url = 'http://www.biquge.com.tw/18_18049/'   # the novel's table-of-contents page on biquge
        url_list = get_url_list(url)                 # collect every chapter URL
        for i in url_list:                           # loop over the chapter URLs
            get_data(i)                              # download and save each chapter
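
    The script above builds chapter URLs by string concatenation and leaves the page encoding to requests' default guess. A small, hedged sketch of an alternative take on get_url_list, using urllib.parse.urljoin for the links and apparent_encoding for the text (the selector and the catalogue URL are taken from the script above):

    import requests
    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def get_url_list(url):
        resp = requests.get(url)
        resp.encoding = resp.apparent_encoding        # re-detect the encoding from the response body
        soup = BeautifulSoup(resp.text, 'lxml')
        # urljoin resolves relative hrefs against the catalogue URL, so no manual concatenation is needed
        return [urljoin(url, a.get('href')) for a in soup.select('#list > dl > dd > a')]

    if __name__ == '__main__':
        for chapter_url in get_url_list('http://www.biquge.com.tw/18_18049/'):
            print(chapter_url)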

     

  • Original article: https://www.cnblogs.com/kangdong/p/8489532.html