zoukankan html css js c++ java

BeautifulSoup

text = soup.find('div', {'class': 'mulu'})  #查找目录，坑死我了。就这个东西，
知乎上看别人写的爬取网络小说，这个最适合我。
我一开始老是使用 BeautifulSoup，find 老是不准，原来要写成上面这个格式。真是太无情了。
不准的原因还有一个，那就是解析的页面错了。愚蠢的人类
继续学习。

作者：周小馬
链接：https://www.zhihu.com/question/48900224/answer/266561350
来源：知乎
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。

# -*- coding: utf-8 -*-
# 导入俩库，足够了
import requests
from bs4 import BeautifulSoup

# Module-level settings shared by get_url() and get_text().
url = "http://www.88dushu.com/xiaoshuo/2/2392/index.html"  # index page of the novel (chapter catalogue)
url_text = "http://www.88dushu.com/xiaoshuo/2/2392/"  # base URL of the novel; each chapter's link is appended to it
# Positions in the catalogue list that get_url() reads.
# NOTE(review): the author's note says part three is "174-294", but range() stops at 293 — TODO confirm the intended upper bound.
page = range(174, 294)

# 定义第一个函数， 用来爬取每一章的url和章节名
def get_url(url):
    """Return the links and titles of the chapters selected by ``page``.

    Downloads the novel's index page at *url*, collects the ``<li>`` entries
    inside ``<div class="mulu">``, and reads the ``<a>`` tag of every entry
    whose position appears in the module-level ``page`` range.

    Args:
        url: Address of the novel's index page.

    Returns:
        A ``(urls, titles)`` tuple of two equally long lists: the ``href``
        value and the link text of each selected chapter, in ``page`` order.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        AttributeError: The expected markup (``div.mulu`` > ``ul`` > ``li``
            > ``a``) is missing from the page.
        IndexError: ``page`` refers to a position beyond the catalogue.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(url, timeout=10)
    # Stop on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results vary by machine.
    soup = BeautifulSoup(response.content, 'html.parser')
    # Every chapter lives in an <li> of the catalogue list.
    entries = soup.find('div', {'class': 'mulu'}).find('ul').find_all('li')
    urls = []
    titles = []
    for i in page:
        link = entries[i].find('a')  # look the anchor up once per entry
        urls.append(link.get('href'))
        titles.append(link.get_text())
    return urls, titles

# 定义第二个函数，用来得到每一章的内容，并存入TXT文件
def get_text(path='d:/kuanglong.txt'):
    """Download every selected chapter and append it to a text file.

    Chapter links and titles come from ``get_url(url)``. Each chapter page is
    fetched from ``url_text`` + link, the text of ``<div class="yd_text2">``
    is extracted, the site's advertising markers and non-breaking spaces are
    removed, and the chapter title followed by the cleaned text is written.

    Args:
        path: Output file, opened in append mode so earlier content is kept.
            Defaults to the location the script has always used.

    Raises:
        requests.RequestException: A request failed, timed out, or returned
            an HTTP error status.
        AttributeError: A chapter page has no ``div.yd_text2`` element.
        OSError: The output file cannot be opened or written.
    """
    urls, titles = get_url(url)
    # Strings stripped from every chapter, applied in this order. The last
    # one is the non-breaking space; the original literal had lost its
    # backslash and would only have removed the three characters "xa0".
    ad_markers = (
        "****[ 请到  六九中文阅读最新章节 ]****",
        '[****/[  六九中文急速更新 ]****/]',
        '\xa0',
    )
    # `with` closes the file even if a download or parse step raises.
    # UTF-8 is set explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, 'a', encoding='utf-8') as f:
        for link, title in zip(urls, titles):
            # Full chapter address = novel base URL + the chapter's link.
            response = requests.get(url_text + str(link), timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.find('div', {'class': 'yd_text2'}).get_text()
            for marker in ad_markers:
                text = text.replace(marker, '')
            # Chapter title first, then the chapter body.
            f.write(title + text)

# Run the scraper only when this file is executed as a script, not when imported
if __name__ == '__main__':
    get_text()

获取文档链接

# Print the href attribute of every <a> tag found in `soup`.
# NOTE(review): `soup` is not defined in this snippet — presumably a BeautifulSoup object built beforehand.
for link in soup.find_all('a'):
    print(link.get('href'))
    # Example output:
    # http://example.com/elsie
    # http://example.com/lacie
    # http://example.com/tillie

获取文档文字内容

# Print the text returned by soup.get_text().
# NOTE(review): `soup` is not defined in this snippet — presumably a BeautifulSoup object built beforehand.
print(soup.get_text())
# Example output:
# The Dormouse's story
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
# ...

慢不要快，稳才是对的。

查看全文

相关阅读:
ObjectiveC字符串处理
 分享 10 个 jQuery 的语言翻译插件
 30 个实用的 jQuery 选项卡/导航教程推荐
 iphoneCocos2D游戏开发
 cocos2d和unity3d的比较
 将NSString转换编码集变为GBK或GB2312
超过 40 款很有用而且很新的 jQuery 插件
 表格单元的表现形式
 ShareKit
UI Prototype Design IDE（界面原型设计工具）

原文地址：https://www.cnblogs.com/sakura3/p/8460224.html