  • Crawler exercise: scraping a novel

    # Program entry point    start.py
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import os, sys

    # Project root is two directory levels above this file; add it to
    # sys.path so the `core` package can be imported.
    BASEPATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    print(BASEPATH)
    sys.path.append(BASEPATH)

    from core import SpiderMan

    if __name__ == '__main__':
        s = SpiderMan.SpiderMan()
        s.run()  # renamed from `async`, which is a reserved word since Python 3.7
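    Note that the double os.path.dirname call assumes start.py sits in a subdirectory (for example bin/) next to the core package; if start.py lives at the project root itself, a single dirname call would be enough.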
    # Crawler scheduler
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from gevent import spawn, monkey, joinall; monkey.patch_all()
    from concurrent.futures import ThreadPoolExecutor
    from core.UrlManager import UrlManager
    from core.Htmldown import Htmldown
    from core.Htmlparser import Htmlparser
    # from core.DataOutput import DataOutput

    class SpiderMan:
        def __init__(self):
            self.manager = UrlManager()   # URL management
            self.downloader = Htmldown()  # HTML downloading
            self.parser = Htmlparser()    # HTML parsing
            # self.output = DataOutput()

        def index_work(self):
            '''Crawl the Lingxiao index page.'''
            url = 'http://www.lingxiaozhishang.com'
            self.manager.oldurls.add(url)               # mark the index url as visited
            html_dict = self.downloader.down_page(url)  # fetch via the downloader
            if html_dict is None:
                # raise
                print("Failed to crawl the index page")
                return None
            new_urls = self.parser.parser_index(html_dict, url)  # parse the second-level (chapter) links
            self.manager.add_urls(new_urls)  # queue all extracted <a> hrefs
            print("Index page crawled; all chapter urls collected")

        def run(self):
            '''Start the coroutines: monkey.patch_all() turns the pool's
            threads into gevent greenlets. (Renamed from `async`, which is
            a reserved word since Python 3.7.)'''
            self.index_work()
            pool = ThreadPoolExecutor(10)  # pool of ten workers
            while True:
                url = self.manager.get_url()  # take a url from the url manager
                if url is None:
                    break
                # submit the download task; parse the page in a done-callback
                pool.submit(self.downloader.down_page, url).add_done_callback(self.parser.parser_page)
            pool.shutdown(wait=True)  # finally, shut the pool down
            print("Done -----------------------")
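    One detail worth calling out: add_done_callback hands the callback the Future itself, not the downloaded text, which is why parser_page further down starts with html_conf.result(). A minimal standalone sketch of the same pattern (fake_down_page is a stand-in, not part of the project):

    from concurrent.futures import ThreadPoolExecutor

    def fake_down_page(url):
        # stand-in for Htmldown.down_page
        return "<html>%s</html>" % url

    def handle(future):
        # the callback receives a Future; unwrap it with .result()
        html = future.result()
        print("got", html)

    pool = ThreadPoolExecutor(2)
    for u in ("a", "b", "c"):
        pool.submit(fake_down_page, u).add_done_callback(handle)
    pool.shutdown(wait=True)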
    # URL manager
    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    class UrlManager:
        def __init__(self):
            self.newurls = set()  # urls still to crawl
            self.oldurls = set()  # urls already crawled

        def add_url(self, newurl):
            '''Add one chapter url, skipping any already crawled.'''
            if newurl not in self.oldurls:
                self.newurls.add(newurl)

        def add_urls(self, newurls):
            '''Add several chapter urls.'''
            if len(newurls) == 0:
                return
            for url in newurls:
                self.add_url(url)

        def get_url(self):
            '''Take one chapter url, moving it to the crawled set.'''
            try:
                url = self.newurls.pop()
                if url is not None:
                    self.oldurls.add(url)
                    return url
            except KeyError:
                pass  # queue exhausted; implicitly returns None

        def has_oldurls(self):
            '''Return the number of chapter urls already crawled.'''
            return len(self.oldurls)
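    A quick check of the dedup behaviour (assuming the class above is importable):

    m = UrlManager()
    m.add_urls(["/book/439.html", "/book/439.html", "/book/440.html"])
    print(m.get_url())      # one of the two distinct urls (set order is arbitrary)
    print(m.has_oldurls())  # 1
    print(m.get_url())      # the other url
    print(m.get_url())      # None -- queue is exhausted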
    # HTML downloader
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import requests

    class Htmldown:
        def down_page(self, url):
            '''Download the page content; returns None on a non-200 response.'''
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
            r = requests.get(url, headers=headers)
            r.encoding = 'utf8'
            if r.status_code == 200:
                return r.text
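    As written, down_page has no timeout, so one stalled connection can hang a worker indefinitely. A more defensive variant might look like this (a sketch, not part of the original project):

    import requests

    def down_page_safe(url, retries=3, timeout=10):
        '''Like Htmldown.down_page, but with a timeout and simple retries.'''
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'}
        for _ in range(retries):
            try:
                r = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                continue  # network error: try again
            if r.status_code == 200:
                r.encoding = 'utf8'
                return r.text
        return None  # all attempts failed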
    # HTML parser      (chapters are written straight to files here; they really ought to be persisted to MongoDB)
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from bs4 import BeautifulSoup

    class Htmlparser:
        def parser_index(self, html_conf, url):
            '''Parse the index page into a list of absolute chapter urls.'''
            soup = BeautifulSoup(html_conf, 'html.parser')
            list_a = soup.find(class_="chapterlist").find_all('a')
            new_urls = []
            for a in list_a:
                # url = http://www.lingxiaozhishang.com, href = /book/439.html
                new_url = "%s%s" % (url, a.attrs["href"])
                new_urls.append(new_url)
            return new_urls

        def parser_page(self, html_conf):
            '''Parse one chapter page and write it out as a text file.'''
            html_conf = html_conf.result()  # the done-callback receives a Future; unwrap it
            soup = BeautifulSoup(html_conf, 'html.parser')
            title = soup.find('h1').get_text()
            text = soup.find(id="BookText").get_text()
            filepath = r"C:\Users\Administrator\Desktop\Article\db\%s.txt" % title
            with open(filepath, "w", encoding="utf8") as f:  # explicit encoding so Chinese text writes safely on Windows
                f.write(text)
            print("%s downloaded" % title)
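    Following the author's own note that output should go to MongoDB rather than flat files, here is one sketch of what the commented-out DataOutput class could look like with pymongo (the database name `novel` and collection `chapters` are assumptions):

    # DataOutput.py -- a sketch of MongoDB persistence with pymongo
    from pymongo import MongoClient

    class DataOutput:
        def __init__(self, host='localhost', port=27017):
            self.client = MongoClient(host, port)
            self.coll = self.client['novel']['chapters']

        def save_chapter(self, title, text):
            # upsert keyed on the title so re-crawls do not duplicate chapters
            self.coll.update_one({'title': title},
                                 {'$set': {'text': text}},
                                 upsert=True)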
  • Original post: https://www.cnblogs.com/52-qq/p/8343014.html