zoukankan      html  css  js  c++  java
  • python爬虫-搜索小说并下载

# coding: utf-8
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
      7 
      8 class downloader():
      9 
     10     def __init__(self):
     11         self.urls = []  # 保存章节链接
     12         self.name = []  # 保存章节名
     13         self.url = 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqugex.com&q='
     14 
     15     """输入小说名,搜索"""
     16     def Get_url(self):
     17         #创建chrome参数对象,设置chrome浏览器无界面模式
     18         chrome_options = Options()
     19         chrome_options.add_argument('--headless')
     20         # 创建chrome无界面对象
     21         browser = webdriver.Chrome(options=chrome_options)
     22         browser.get(self.url)
     23         c = input('请输入小说全名:')
     24         browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[3]').send_keys(c)
     25         browser.find_element_by_xpath('//*[@id="wrapper"]/div[1]/div[2]/form/input[4]').click()
     26         new_url = browser.current_url
     27         # 关闭浏览器
     28         browser.close()
     29         # 关闭chromedriver进程
     30         browser.quit()
     31         print("已关闭浏览器")
     32         # print(new_url)
     33         response = requests.get(new_url)
     34         response.encoding = 'utf-8'
     35         soup = BeautifulSoup(response.text, 'lxml')
     36         # print(soup)
     37         name1 = soup.find_all('span', class_='s2')
     38         soup = BeautifulSoup(str(name1), 'lxml')
     39         new_name = soup.find('a')
     40         new_name1 = new_name.string
     41         # print(new_name1)
     42         self.href = new_name.attrs['href']
     43         print(self.href)
     44         return self.href
     45     def Response(self):
     46         response = requests.get(self.href)
     47         response.encoding = 'gbk'  # 解决乱码
     48         self.soup = BeautifulSoup(response.text, 'lxml')  # 解析网页
     49         div = self.soup.find_all('div', class_='listmain')  # 在解析结果中查找class_='listmain'
     50         soup1 = BeautifulSoup(str(div), 'lxml')  # 删除字符串头和尾的空格
     51         h = soup1.find_all('a')  # 在class_='listmain下面找到a标签
     52         for i in h:
     53             self.name.append(i.string)  # 将a标签中的非属性字符,即章节名添加到name
     54             self.urls.append('https://www.biqugex.com%s' % i.get('href'))  # 将a标签中的链接,添加到urls
     55 
     56     def file(self):
     57         """查找小说名字,并创建同名文件夹"""
     58         div1 = self.soup.select('body > div.book > div.info > h2')
     59         a = BeautifulSoup(str(div1), 'lxml')
     60         b = a.find('h2')
     61         b = b.string
     62         c = 'C:\Users\Administrator\Desktop\%s' % b
     63         if not os.path.exists(c):
     64             os.mkdir(c)
     65 
     66         # 循环解析urls,得到小说正文
     67         i = 0
     68         while i < len(self.urls):
     69             response1 = requests.get(url=self.urls[i])
     70             response1.encoding = 'gbk'
     71             soup2 = BeautifulSoup(response1.text, 'lxml')
     72             d = soup2.find_all('div', id='content')
     73             id1 = BeautifulSoup(str(d), 'lxml')
     74             # 创建文件名
     75             src = self.name[i] + '.txt'
     76             filename = c + '/' + src
     77             print(filename)
     78 
     79             # 将解析到的小说正文写到文件中
     80             for result in id1:
     81                 res = result.text
     82                 id2 = soup2.select('#content')
     83                 with open(filename, 'w+', encoding='utf-8') as f:
     84                     f.write(res)
     85                 i += 1
     86 #如果输入的网址不是正确的网址,则提示请输入正确的笔趣阁网址
     87     def Main(self):
     88         try:
     89             d = downloader()
     90             d.Get_url()
     91         except:
     92             print('没有找到')
     93         else:
     94             d.Response()
     95             d.file()
     96 
     97 
     98 
     99 if __name__ == '__main__':
    100     # url=input('请输入网址:')
    101     # url='https://www.biqugex.com/book_104027/'
    102     a = downloader()
    103     a.Main()
  • 相关阅读:
    也谈谈关于WEB的感想
    spring boot,https,双向ssl认证
    Spring Cloud Gateway(二)
    Spring Cloud Gateway(一)
    .Net Web Service 自定义返回值命名
    随便记一下,C#并行环境操作Winform的代码段
    随便记一下,解决Windows Server 2012无法远程登录的方法
    记录C#控件DataGridView绑定BindingList无法排序问题(转)
    记录SQL Server 2019链接Oracle 11g R2的过程
    Json CPP 中文支持与入门示例
  • 原文地址:https://www.cnblogs.com/hfct/p/11661063.html
Copyright © 2011-2022 走看看