  • Scraping JoJo's Bizarre Adventure Part 7: Steel Ball Run from 漫画DB

    SBR is my favorite part of the JoJo series, so today I scraped the manga to my local machine to read at my own pace later.

    import os
    import requests
    from requests import codes
    from requests import RequestException
    from bs4 import BeautifulSoup
    
    def get_page(url):
        # fetch a page and return its HTML text, or None on any failure
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                       + '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    def get_pagesNumber(text):
        soup = BeautifulSoup(text, 'lxml')
        # the total page count is exposed through a hidden <div class="d-none vg-r-data" data-total="..."> element
        # (see the short sketch after the code)
        pagesNumber = soup.find(name='div', class_="d-none vg-r-data")
        return pagesNumber.attrs['data-total']
        
    def parse_page(text):
        soup = BeautifulSoup(text, 'lxml')
        url = soup.find(name='img', class_="img-fluid show-pic")
        chapter = soup.find(name='h2', class_="h4 text-center")
        page = soup.find(name='span', class_="c_nav_page")
        yield {
            'url': url['src'],
            'chapter': chapter.get_text(),
            'page': page.get_text()
        }
    # return ends a function as soon as it hands back its result,
    # while yield turns the function into a generator: a generator produces one value at a time,
    # the function is frozen, and it produces the next value when it is woken up again
    # (see the short sketch after the code)
        
    def save_image(item):
        img_path = 'SBR' + os.path.sep + item.get('chapter')  # os.path.sep is the platform's path separator
        if not os.path.exists(img_path):
            os.makedirs(img_path)
        try:
            resp = requests.get(item.get('url'))
            if codes.ok == resp.status_code:
                file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                    file_name=item.get('page'), file_suffix='jpg')
                if not os.path.exists(file_path):
                    with open(file_path, 'wb') as f:
                        f.write(resp.content)
                    print('Downloaded image path is %s' % file_path)
                else:
                    print('Already Downloaded', file_path)
        except Exception as e:
            print(e)
    
    if __name__ == '__main__':
        for chapter in range(292, 316):  # 24 chapters in total, numbered 292 to 315 (the color edition uses 13283 to 13306)
            url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '.html'
            text = get_page(url)
            pagesNumber = get_pagesNumber(text)  # total number of pages in the current chapter
            for page in range(1, int(pagesNumber) + 1):
                url = 'https://www.manhuadb.com/manhua/147/4_' + str(chapter) + '_' + str(page) + '.html'
                # color edition: url = 'https://www.manhuadb.com/manhua/147/1330_' + str(chapter) + '_' + str(page) + '.html'
                text = get_page(url)
                for item in parse_page(text):
                    save_image(item)
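
    A quick illustration of the return-versus-yield point noted in the comments above (a throwaway sketch with made-up names, not part of the crawler):

    def first_three():
        for n in (1, 2, 3):
            yield n          # the function is suspended here after handing out each value

    for n in first_three():  # each loop step resumes the generator for one more value
        print(n)             # prints 1, then 2, then 3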

     In the end, the whole run is saved locally under the SBR directory, one folder per chapter.
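
    As an aside on get_pagesNumber: the page count is read from a hidden <div class="d-none vg-r-data" data-total="..."> element that each chapter page carries. A minimal sketch of that extraction on a made-up snippet (the trimmed-down markup and the value are illustrative only; the real pages carry more attributes and may change):

    from bs4 import BeautifulSoup

    html = '<div class="d-none vg-r-data" data-total="21"></div>'  # made-up snippet for illustration
    soup = BeautifulSoup(html, 'lxml')
    total = soup.find('div', class_='d-none vg-r-data')['data-total']
    print(int(total))  # attribute values come back as strings, so cast before using as a range bound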

  • Original post: https://www.cnblogs.com/oeong/p/11768470.html