# -*- coding: utf-8 -*-
"""
Created on Tue Dec 1 12:31:07 2020
@author: zhaolulu
"""
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import etree
# HTTP headers passed to requests.get(); the User-Agent string identifies the
# client as a desktop Chrome browser on Windows.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
}
def url_read(url, timeout=10):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request could not be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Re-raise after reporting: falling through to the return statement
        # would hit an unbound local and hide the real network error.
        print('failed')
        raise
    return response.content.decode('utf-8')
def main():
    """Crawl a sample novel from the Biquge site and save its chapters.

    Prints the links of the newest novels listed on the home page, then
    fetches the chapter list of one hard-coded novel, downloads every
    chapter, and writes the text nodes of each chapter's content element
    as a CSV file in ``../book``. Each file is named after the chapter's
    position in the chapter list; chapters with no content are skipped.
    """
    # Biquge novel website
    url = 'http://www.xbiquge.la/'
    text = url_read(url)
    print("============================================")
    selector = etree.HTML(text)
    # URLs of the newest novels shown on the home page
    ret = selector.xpath('//*[@id="newscontent"]/div[1]/ul/li/span[2]/a//@href')
    for note_url in ret:
        print(note_url)

    # One novel URL is hard-coded here as a test target.
    novel_url = 'http://www.xbiquge.la/62/62585/'
    n_text = url_read(novel_url)
    n_html = etree.HTML(n_text)
    xpath_ret = n_html.xpath('//*[@id="list"]/dl/dd/a/@href')

    # Build the output path from parts: a literal ending in a backslash
    # escapes its own closing quote, and "\b" is a backspace escape.
    out_dir = Path('..') / 'book'
    # to_csv does not create missing directories, so make sure it exists.
    out_dir.mkdir(parents=True, exist_ok=True)

    for index, t_url in enumerate(xpath_ret):
        # Page holding the actual chapter content; urljoin copes with
        # root-relative, relative and absolute hrefs alike.
        f_url = urljoin(novel_url, t_url)
        print(f_url)
        article = url_read(f_url)
        article_text = etree.HTML(article)
        article_detail = article_text.xpath('//*[@id="content"]/text()')
        if article_detail:
            pd.Series(article_detail).to_csv(out_dir / str(index))


if __name__ == '__main__':
    main()