zoukankan html css js c++ java

学习进度-2

小说爬取

1.获得小说每个章节的链接地址
2.获得章节内容
3.保存到指定位置

import urllib
import urllib.request
import urllib.parse
import os
import lxml
import lxml.html
from bs4 import BeautifulSoup

# Accumulator for chapter URLs; geturl() appends to it and geturlbody() reads it.
chat_list = []

# Site root and catalogue page. Both literals look like placeholders that the
# user is expected to replace with a real address -- TODO confirm.
oright_url = '小说网站网址'
cataher_url = urllib.parse.urljoin(base=oright_url, url='目录章节')

# Request headers passed to urllib.request.Request for the catalogue page.
headers = dict([
    ('Host', 'www.xxx.com'),
    ('Referer', 'http://www.xxx.com'),
    ('User-Agent', ''),
])

def geturl():
    """Collect the chapter URLs from the catalogue page into ``chat_list``.

    Requests ``cataher_url`` with the module-level ``headers``, extracts every
    ``href`` found under ``//dd/a``, joins each one against ``oright_url`` and
    appends the result to the module-level ``chat_list``. The resulting list
    is printed.
    """
    req = urllib.request.Request(cataher_url, headers=headers)
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(req) as resp:
        html = lxml.html.parse(resp)
    hrefs = html.xpath('//dd/a/@href')
    chat_list.extend(urllib.parse.urljoin(oright_url, href) for href in hrefs)
    print(chat_list)
geturl()

def geturlbody():
    """Download every chapter in ``chat_list`` and append it to the novel file.

    For each URL in the module-level ``chat_list`` the page is fetched and
    parsed with BeautifulSoup. The text of the first ``<h1>`` is written as
    the chapter title, followed by the text of every ``<p>`` element, each
    preceded by a newline. Output is appended to ``小说名字.txt`` (UTF-8), and
    the title is printed once the chapter has been written.
    """
    for chat in chat_list:
        # Send the same headers as the catalogue request in geturl().
        req = urllib.request.Request(chat, headers=headers)
        with urllib.request.urlopen(req) as resp:
            soup = BeautifulSoup(resp, 'lxml')

        # ``.string`` is None when a tag has several children, which would
        # break the concatenation below; get_text() always returns a str.
        heading = soup.h1
        title = heading.get_text() if heading is not None else ''

        # The context manager closes the file even if a write fails.
        with open('小说名字.txt', 'a+', encoding='utf-8') as f:
            f.write('\n' + title)
            for content in soup.find_all('p'):
                f.write('\n' + content.get_text())
        print('======>' + title)
geturlbody()

查看全文

相关阅读:
MySQL Error 1170 (42000): BLOB/TEXT Column Used in Key Specification Without a Key Length
递归枚举IHTMLDocument2的所有元素
 递归创建文件和文件夹
 通过ARP协议获取MAC地址
 监控文件(夹)的改变
 ATL和MFC的C++类和HWND的映射机制
 枚举当前环境中打开的所有IE
封装字符串的Format操作
 python decimal和fractions模块
 解决Output Designer字体问题

原文地址：https://www.cnblogs.com/Glzt/p/15435565.html