  • Crawler practice: scraping a novel

    # Program entry point    start.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import os, sys

    # Put the project root (one level above this file's directory) on sys.path
    # so the core package can be imported from anywhere.
    BASEPATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    print(BASEPATH)
    sys.path.append(BASEPATH)

    from core import SpiderMan

    if __name__ == '__main__':
        s = SpiderMan.SpiderMan()
        s.run()  # renamed from async(): async is a reserved word in Python 3.7+
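    For the sys.path trick to work, start.py has to sit one directory below the
    project root, next to the core package. A plausible layout (the bin/ name is
    an assumption; core/ and its module names follow from the imports):

    project/                  # BASEPATH resolves here
    ├── bin/                  # hypothetical directory name
    │   └── start.py
    └── core/
        ├── SpiderMan.py
        ├── UrlManager.py
        ├── Htmldown.py
        └── Htmlparser.py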
    # Crawler scheduler    SpiderMan.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from gevent import monkey; monkey.patch_all()  # patch blocking I/O so the pool's threads yield cooperatively
    from concurrent.futures import ThreadPoolExecutor
    from core.UrlManager import UrlManager
    from core.Htmldown import Htmldown
    from core.Htmlparser import Htmlparser
    # from core.DataOutput import DataOutput

    class SpiderMan:
        def __init__(self):
            self.manager = UrlManager()    # URL manager
            self.downloader = Htmldown()   # HTML downloader
            self.parser = Htmlparser()     # HTML parser
            # self.output = DataOutput()

        def index_work(self):
            '''Crawl the Lingxiao index page.'''
            url = 'http://www.lingxiaozhishang.com'
            self.manager.oldurls.add(url)               # mark the index page itself as crawled
            html_dict = self.downloader.down_page(url)  # download the index page
            if html_dict is None:
                # raise
                print("Failed to crawl the index page")
                return None
            new_urls = self.parser.parser_index(html_dict, url)  # parse the chapter links
            self.manager.add_urls(new_urls)             # queue every chapter URL
            print("Index page crawled; all chapter URLs queued")

        def run(self):  # renamed from async(), a reserved word in Python 3.7+
            '''Dispatch chapter downloads concurrently.'''
            self.index_work()
            pool = ThreadPoolExecutor(10)  # pool of ten workers
            while True:
                url = self.manager.get_url()  # take the next URL from the manager
                if url is None:
                    break
                # submit the download; the parser runs as a callback on the finished Future
                pool.submit(self.downloader.down_page, url).add_done_callback(self.parser.parser_page)
            pool.shutdown(wait=True)  # wait for all workers to finish
            print("Done -----------------------")
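    One subtlety in run(): add_done_callback hands the callback a
    concurrent.futures.Future, not the page text, so the parser must unwrap it
    with .result(). A minimal standalone sketch of that pattern (fetch and
    handle are illustrative names, not part of the project):

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from concurrent.futures import ThreadPoolExecutor

    def fetch(url):
        # stand-in for Htmldown.down_page
        return "<html>%s</html>" % url

    def handle(future):
        # stand-in for Htmlparser.parser_page: the argument is a Future,
        # so the actual return value has to be unwrapped with .result()
        html = future.result()
        print("parsed", html)

    pool = ThreadPoolExecutor(2)
    pool.submit(fetch, "http://example.com/1").add_done_callback(handle)
    pool.shutdown(wait=True)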
    # URL manager    UrlManager.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    class UrlManager:
        def __init__(self):
            self.newurls = set()  # URLs waiting to be crawled
            self.oldurls = set()  # URLs already crawled

        def add_url(self, newurl):
            '''Add one chapter URL unless it has already been crawled.'''
            if newurl not in self.oldurls:
                self.newurls.add(newurl)

        def add_urls(self, newurls):
            '''Add a batch of chapter URLs.'''
            if len(newurls) == 0:
                return
            for url in newurls:
                self.add_url(url)

        def get_url(self):
            '''Pop one pending URL and mark it crawled; return None when the queue is empty.'''
            try:
                url = self.newurls.pop()  # set.pop raises KeyError on an empty set
                self.oldurls.add(url)
                return url
            except KeyError:
                return None

        def has_oldurls(self):
            '''Return the number of chapters already crawled.'''
            return len(self.oldurls)
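    A quick round-trip through the manager shows the deduplication behaviour
    (the URLs are illustrative):

    m = UrlManager()
    m.add_urls(["/book/1.html", "/book/2.html"])  # two pending chapters
    url = m.get_url()       # pops one and moves it into oldurls
    m.add_url(url)          # already crawled, so it is not re-queued
    print(m.has_oldurls())  # 1
    m.get_url()             # pops the remaining URL
    print(m.get_url())      # None: the queue is empty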
    # HTML downloader    Htmldown.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import requests

    class Htmldown:
        def down_page(self, url):
            '''Download a page; return its text, or None on a non-200 response.'''
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
            r = requests.get(url, headers=headers)
            r.encoding = 'utf8'
            if r.status_code == 200:
                return r.text
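    As written, down_page sets no timeout, so one stalled connection can hang a
    worker indefinitely, and any network error propagates into the Future. A
    hedged variant with a timeout and simple retries (the retry count and
    timeout values are arbitrary choices, not from the original):

    import requests

    class Htmldown:
        def down_page(self, url, retries=3, timeout=10):
            '''Download a page with a timeout, retrying a few times before giving up.'''
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
            for _ in range(retries):
                try:
                    r = requests.get(url, headers=headers, timeout=timeout)
                except requests.RequestException:
                    continue  # network error: try the next attempt
                r.encoding = 'utf8'
                if r.status_code == 200:
                    return r.text
            return None  # every attempt failed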
    # HTML parser    Htmlparser.py
    # Parsed chapters are written straight to files; they should really be
    # persisted to MongoDB (see the sketch after this block)
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    class Htmlparser:
        def parser_index(self, html_conf, url):
            '''Collect every chapter link from the index page.'''
            soup = BeautifulSoup(html_conf, 'html.parser')
            list_a = soup.find(class_="chapterlist").find_all('a')
            new_urls = []
            for a in list_a:
                # url  = http://www.lingxiaozhishang.com
                # href = /book/439.html
                new_url = "%s%s" % (url, a.attrs["href"])
                new_urls.append(new_url)
            return new_urls

        def parser_page(self, future):
            '''Parse one chapter page; receives the Future completed by the pool.'''
            html_conf = future.result()
            if html_conf is None:  # the download failed, nothing to parse
                return
            soup = BeautifulSoup(html_conf, 'html.parser')
            title = soup.find('h1').get_text()
            text = soup.find(id="BookText").get_text()
            filepath = r"C:\Users\Administrator\Desktop\Articledb\%s.txt" % title
            with open(filepath, "w", encoding="utf8") as f:  # utf8, or Chinese text may fail on Windows
                f.write(text)
            print("%s downloaded" % title)
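    The header comment above says the chapters belong in MongoDB, and SpiderMan
    already carries a commented-out DataOutput hook. A minimal sketch of such a
    class using pymongo (the database and collection names are assumptions):

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from pymongo import MongoClient

    class DataOutput:
        def __init__(self):
            # assumes a MongoDB running locally; 'novel' and 'chapters' are made-up names
            self.client = MongoClient('localhost', 27017)
            self.coll = self.client['novel']['chapters']

        def store_chapter(self, title, text):
            '''Upsert one chapter keyed by title so re-crawls do not create duplicates.'''
            self.coll.update_one({'title': title},
                                 {'$set': {'text': text}},
                                 upsert=True)

    parser_page would then call self.output.store_chapter(title, text) instead
    of writing a text file.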
  • Original post: https://www.cnblogs.com/52-qq/p/8343014.html