zoukankan      html  css  js  c++  java
  • Python爬虫-《神雕侠侣》

    Python3.5

    爬取《神雕侠侣》http://www.kanunu8.com/wuxia/201102/1610.html

    我是武侠迷,所以喜欢爬取武侠小说。

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    
    from selenium import webdriver
    import os
    from docx import Document
    import re
    
    class House():
        """Scrape the chapters of one novel and save them into a .docx file.

        The table-of-contents page at ``baseUrl`` is loaded in a headless
        PhantomJS browser, every chapter link on it is collected, and the
        title and text of each chapter are appended to a Word document.

        NOTE(review): this class targets the Selenium 3 API
        (``webdriver.PhantomJS`` and ``find_element(s)_by_xpath``); confirm
        the installed Selenium version still provides it.
        """

        # Chapter pages are named with exactly five digits, e.g. ".../46280.html".
        # Raw string with an escaped dot so only a literal ".html" suffix matches.
        _CHAPTER_LINK = re.compile(r".+/[0-9]{5}\.html$")

        def __init__(self):
            # NOTE(review): these headers are stored here but no method of this
            # class passes them to the browser.
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
            self.baseUrl = 'http://www.kanunu8.com/wuxia/201102/1610.html'
            # Absolute path, so it stays valid after makedir() changes the
            # current working directory.
            self.basePath = os.path.dirname(os.path.abspath(__file__))

        @classmethod
        def _isChapterLink(cls, href):
            """Return True when *href* is a string that looks like a chapter URL."""
            # Anchors without an href attribute yield None; treat them as non-links.
            if not isinstance(href, str):
                return False
            return cls._CHAPTER_LINK.match(href) is not None

        def makedir(self, name):
            """Create directory *name* under ``basePath`` and change into it.

            Prints whether the directory was created or already existed.
            Raises OSError if the path cannot be created or entered.
            """
            path = os.path.join(self.basePath, name)
            # Attempt the creation directly instead of checking first, which
            # avoids a race between the check and the creation.
            try:
                os.makedirs(path)
            except FileExistsError:
                print('The file is existed.')
            else:
                print('File has been created.')
            # Switch into the directory so later relative saves land there.
            os.chdir(path)

        def connect(self, url):
            """Open *url* in a new PhantomJS driver.

            Returns the driver on success; the caller is responsible for
            calling ``quit()`` on it. Returns None when the browser cannot be
            started or the page cannot be loaded.
            """
            driver = None
            try:
                driver = webdriver.PhantomJS()
                driver.get(url)
                return driver
            except Exception:
                print('This page is not existed.')
                # Do not leave a half-initialised browser process behind.
                if driver is not None:
                    driver.quit()
                return None

        # Collect the link address of every chapter listed on the page.
        def getBookLinkList(self, url):
            """Return the list of chapter URLs found on the page at *url*.

            Returns an empty list when the page cannot be opened or read.
            """
            bookLinkList = []
            driver = self.connect(url)
            if driver is None:
                print('Error')
                return bookLinkList
            try:
                # Look at every anchor element on the page.
                for link in driver.find_elements_by_xpath("//a"):
                    href = link.get_attribute('href')
                    print(href)
                    # Keep only the links that point at chapter pages.
                    if self._isChapterLink(href):
                        print('ok')
                        bookLinkList.append(href)
            except Exception:
                print('Error')
            finally:
                # Each connect() starts a browser process; always release it.
                driver.quit()

            return bookLinkList

        # Fetch the detail data (title and text) of one chapter.
        def getBookDetail(self, url):
            """Return ``(title, content)`` for the chapter page at *url*.

            Both values are empty strings when the page cannot be opened or
            the expected elements are missing.
            """
            driver = self.connect(url)
            if driver is None:
                return '', ''
            try:
                # The first <h2> is read as the title, the first <p> as the text.
                title = driver.find_element_by_xpath('//h2').text
                content = driver.find_element_by_xpath('//p').text
            except Exception:
                print('Error.')
                return '', ''
            finally:
                driver.quit()
            print(title)
            print(content)
            return title, content

        def getData(self):
            """Scrape every chapter and save them all into one .docx file.

            The file is written into the ``StoryFiles`` directory under
            ``basePath``. Chapters that could not be scraped are skipped.
            """
            doc = Document()
            self.makedir('StoryFiles')
            for linkUrl in self.getBookLinkList(self.baseUrl):
                title, content = self.getBookDetail(linkUrl)
                if not (title or content):
                    continue
                # add_paragraph expects a string, so the title and the text
                # are written as separate paragraphs.
                doc.add_paragraph(title)
                doc.add_paragraph(content)

            doc.save('神雕侠侣.docx')
    
    if __name__ == '__main__':
        # Script entry point: build the scraper and run the full export.
        scraper = House()
        scraper.getData()
  • 相关阅读:
    爱情的诗·1~5节
    人生的诗·381~385节
    人生的诗·375~380节
    python数据类型初始1
    python编码知识初始_ASCII码,Unicode,Utf-8,GBK
    Python运算符,逻辑运算
    python格式化输出%,while else
    pycharm使用教程链接+部分练习题01
    python流程控制-条件语句If,while循环
    python基础数据类型初始,用户交互
  • 原文地址:https://www.cnblogs.com/fredkeke/p/7761100.html
Copyright © 2011-2022 走看看