zoukankan      html  css  js  c++  java
  • 爬取四大名著

    '''
     诗词名句网
     1. 爬取固定书籍
     2. 爬取书名
     3. 爬取本部书的章回目录
     4. 灵活处理,爬取任意书籍的章回目录
     5. 加入异常处理
     6. 爬取任意整本书
    '''
    
    import requests
    import re
    
    def bookSpider(oldurl, bookName):
        """Download a book's index page, then extract its title, chapter list and text.

        The index page (``oldurl`` + ".html") is saved to ``demo.txt``; findTitle
        and findTileOfPages then read that file, and getWholeBook downloads as
        many chapters as findTileOfPages counted.

        Args:
            oldurl: Base URL of the book, without the ".html" suffix.
            bookName: Book identifier as it appears in the site's chapter links.

        Returns:
            None. The function stops early when the index page cannot be
            downloaded or saved.
        """
        url = oldurl + ".html"
        html = loadPage(url)
        if html is None:
            # loadPage prints its own error and yields None on failure; going on
            # would only parse an empty or stale demo.txt.
            return
        try:
            with open("demo.txt", 'w', encoding='utf-8') as f:
                f.write(html)
        except OSError:
            print("FILE OPERATION ERROR")
            return
        findTitle("demo.txt", bookName)
        cnt = findTileOfPages("demo.txt", bookName)
        getWholeBook(oldurl, bookName, cnt)
    
    def findTitle(filename, bookName):
        """Extract the book title from a saved index page and start ``book.txt`` with it.

        Every line of *filename* is searched for ``<title>《...》``. Each title
        found (including its 《》 brackets) is printed after the "书名:" label and
        written, followed by a newline, to ``book.txt``, which is truncated first.

        Args:
            filename: Path of the UTF-8 HTML file to scan.
            bookName: Unused; kept so existing callers keep working.

        Returns:
            None. If a file cannot be opened, read or written,
            "FILE OPERATION ERROR" is printed instead of raising.
        """
        # Capture the bracketed title directly instead of walking over the
        # repr() of the match object one character at a time.
        titlePattern = re.compile(r'<title>(《.{0,10}》)')
        try:
            with open(filename, encoding='utf-8') as page, \
                    open("book.txt", 'w', encoding='utf-8') as book:
                for line in page:
                    found = titlePattern.search(line)
                    if found:
                        title = found.group(1)
                        print("书名:" + title)
                        book.write(title + '\n')
        except OSError:
            print("FILE OPERATION ERROR")
    
    def findTileOfPages(filename, bookName):
        """Append the chapter list found in a saved index page to ``book.txt``.

        A chapter entry is an ``<li><a href="/book/<bookName>/<number>.html">``
        link whose text is 10 to 40 characters long. After a "目录:" heading line,
        each entry's link text is printed and appended to ``book.txt`` on its
        own line.

        Args:
            filename: Path of the UTF-8 HTML file to scan.
            bookName: Book identifier expected inside the chapter links.

        Returns:
            The number of chapter entries found. If a file cannot be opened,
            read or written, "FILE OPERATION ERROR" is printed and the count
            reached so far is returned.
        """
        cnt = 0
        # bookName is user input, so escape it before embedding it in the regex;
        # the single group makes findall() return just the link text.
        chapterPattern = re.compile(
            r'<li><a href="/book/' + re.escape(bookName)
            + r'/\d+\.html">(.{10,40})</a></li>'
        )
        try:
            with open(filename, encoding='utf-8') as page, \
                    open("book.txt", 'a', encoding='utf-8') as book:
                book.write("目录:\n")
                for line in page:
                    for chapterTitle in chapterPattern.findall(line):
                        cnt += 1
                        print(chapterTitle)
                        book.write(chapterTitle + '\n')
        except OSError:
            print("FILE OPERATION ERROR")
        return cnt
    
    def getWholeBook(url, bookName, cnt):
        """Download chapters 1..cnt of a book and append their text to ``book.txt``.

        Each chapter page (``url`` + "/<n>.html") is saved to the scratch file
        ``bookHtml.txt`` and read back line by line. For every line, the text of
        ``<h1>`` headings is appended first, then the text of paragraphs that
        start with four ``&nbsp;`` entities, one output line per match.

        Args:
            url: Base URL of the book, without a trailing slash.
            bookName: Unused; kept so existing callers keep working.
            cnt: Number of chapters to download.

        Returns:
            None. A chapter that cannot be downloaded is skipped; a file error
            prints "FILE OPERATION ERROR" and moves on to the next chapter.
        """
        print("正在下载全本书,请稍后...")
        # Compile once instead of once per input line.
        paragraphPattern = re.compile(r'<p>&nbsp;&nbsp;&nbsp;&nbsp;.+</p>')
        headingPattern = re.compile(r'<h1>.+</h1>')
        for chapter in range(1, cnt + 1):
            newUrl = url + '/' + str(chapter) + ".html"
            print(newUrl)
            html = loadPage(newUrl)
            if html is None:
                # loadPage already reported the failure; nothing to extract.
                continue
            try:
                with open("bookHtml.txt", 'w', encoding='utf-8') as f:
                    f.write(html)
                # Read the page back from the scratch file so line splitting
                # follows the file's newline handling, as before.
                with open('bookHtml.txt', 'r', encoding='utf-8') as page, \
                        open('book.txt', 'a', encoding='utf-8') as bookContent:
                    for line in page:
                        for heading in headingPattern.findall(line):
                            bookContent.write(_stripTags(heading) + '\n')
                        for paragraph in paragraphPattern.findall(line):
                            bookContent.write(_paragraphText(paragraph) + '\n')
            except OSError:
                print("FILE OPERATION ERROR")


    def _stripTags(fragment):
        """Return *fragment* without its ``<...>`` tags and without any stray '>'."""
        return re.sub(r'<[^>]*>', '', fragment).replace('>', '')


    def _paragraphText(fragment):
        """Return the text that follows each ``&nbsp;`` run in *fragment*.

        Copying starts after a ';' that is preceded by 'p' and not followed by
        '&' (the end of an ``&nbsp;`` run) and stops at the next '<'.
        """
        kept = []
        copying = False
        lastIndex = len(fragment) - 1
        for pos, ch in enumerate(fragment):
            if ch == '<':
                copying = False
            elif (ch == ';' and 0 < pos < lastIndex
                  and fragment[pos - 1] == 'p' and fragment[pos + 1] != '&'):
                copying = True
            elif copying:
                kept.append(ch)
        return ''.join(kept)
    
    def loadPage(url, timeout=10):
        """Fetch *url* with a browser-like User-Agent and return the body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Returns:
            The response body decoded as UTF-8, or None when the request or the
            decoding fails (after printing "PAGE LOAD ERROR").
        """
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        try:
            response = requests.get(url, headers=header, timeout=timeout)
            return response.content.decode('utf-8')
        except (requests.RequestException, UnicodeDecodeError):
            # Only network and decoding problems are expected here; anything
            # else is a programming error and should surface.
            print("PAGE LOAD ERROR")
            return None
    
    if __name__ == "__main__":
        # Script entry point: ask for the book's pinyin name and scrape it.
        # Surrounding whitespace is removed because the name is embedded in the
        # URL and in the chapter-link pattern.
        bookName = input("请输入想看的书名:(全拼)").strip()
        if bookName:
            url = "http://www.shicimingju.com/book/" + bookName
            bookSpider(url, bookName)
        else:
            # An empty name would request ".../book/.html", which is no book.
            print("BOOK NAME IS EMPTY")
  • 相关阅读:
    秒杀多线程第八篇 经典线程同步 信号量Semaphore
    SURF特征
    (最短路径算法整理)
    中国大推力矢量发动机WS15 跨入 世界先进水平!
    SQL Profile 总结(一)
    Spring下@ResponseBody响应中文内容乱码问题
    Ubuntu12.04下jamvm1.5.4+classpath-0.98成功执行 helloworld.class
    【2012.1.24更新】不要再在网上搜索eclipse的汉化包了!
    [数据结构] N皇后问题
    DG之主库、备库切换(物理备库)
  • 原文地址:https://www.cnblogs.com/TheSilverMoon/p/11143203.html
Copyright © 2011-2022 走看看