zoukankan      html  css  js  c++  java
  • 爬取漫画DB上的JoJo的奇妙冒险 第七部 飙马野郎

    SBR是JOJO系列我最喜欢的一部,所以今天把漫画爬取到本地,日后慢慢看。

    import os
    import re
    import time

    import requests
    from bs4 import BeautifulSoup
    from requests import RequestException
    from requests import codes
    
    def get_page(url):
        """Fetch ``url`` with a browser-like User-Agent and return the body text.

        Returns:
            The response text when the server answers with status 200,
            otherwise None. None is also returned when ``requests`` raises a
            ``RequestException`` (connection errors, timeouts, ...).
        """
        headers = {
            # Note the space before "(KHTML": without it the two literals fuse
            # into "AppleWebKit/537.36(KHTML", which is not a valid Chrome UA.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        }
        try:
            # A timeout keeps one stalled connection from hanging the whole
            # crawl; a timeout surfaces as a RequestException and yields None.
            response = requests.get(url, headers=headers, timeout=30)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    def get_pagesNumber(text):
        """Return the ``data-total`` attribute of the chapter's data ``div``.

        ``text`` is the HTML of a chapter page. The value is returned exactly
        as it appears in the markup (a string); the script's main loop
        converts it to ``int`` and uses it as the chapter's page count.
        """
        document = BeautifulSoup(text, 'lxml')
        data_div = document.find(name='div', class_="d-none vg-r-data")
        total = data_div.attrs['data-total']
        return total
        
    def parse_page(text):
        """Yield a single dict describing the comic image on one reader page.

        The dict has three keys:
            'url'     -- ``src`` of the ``img.img-fluid.show-pic`` element
            'chapter' -- text of the ``h2.h4.text-center`` heading
            'page'    -- text of the ``span.c_nav_page`` element
        """
        document = BeautifulSoup(text, 'lxml')
        image_tag = document.find(name='img', class_="img-fluid show-pic")
        heading_tag = document.find(name='h2', class_="h4 text-center")
        page_tag = document.find(name='span', class_="c_nav_page")
        item = {
            'url': image_tag['src'],
            'chapter': heading_tag.get_text(),
            'page': page_tag.get_text()
        }
        yield item
    # `return` hands back its result and ends the function.
    # `yield` turns the function into a generator instead: it produces one
    # value, is suspended, and produces the next value when it is resumed.
        
        
    def save_image(item):
        """Download one page image to ``SBR/<chapter>/<page>.jpg``.

        Args:
            item: dict with the keys 'url', 'chapter' and 'page', as yielded
                by ``parse_page``.

        The chapter directory is created on demand. If the target file already
        exists, nothing is downloaded. Download and write errors are printed
        rather than raised, so one bad page does not stop the crawl.
        """
        # os.path.sep is the platform's path separator.
        img_path = 'SBR' + os.path.sep + item.get('chapter')
        os.makedirs(img_path, exist_ok=True)

        file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
            file_name=item.get('page'), file_suffix='jpg')

        # Check before requesting so a re-run does not re-download every
        # image only to throw the bytes away.
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            return

        try:
            # The timeout keeps one stalled image from hanging the crawl.
            resp = requests.get(item.get('url'), timeout=30)
            if codes.ok == resp.status_code:
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
        except Exception as e:
            # Deliberately best-effort: report the failure and move on.
            print(e)
    
    if __name__ == '__main__':
        # Chapter ids 292..315 (24 chapters) were found by inspecting the site.
        # NOTE(review): the original author's note suggests the colour edition
        # uses chapter ids 13283..13306 with the URL prefix '1330_' instead of
        # '4_' -- confirm against the site before switching.
        base_url = 'https://www.manhuadb.com/manhua/147/4_'
        for chapter in range(292, 316):
            chapter_html = get_page(base_url + str(chapter) + '.html')
            if chapter_html is None:
                # get_page returns None on any failure; skip this chapter
                # instead of crashing inside the HTML parser.
                print('Failed to fetch chapter %s, skipping' % chapter)
                continue
            pagesNumber = get_pagesNumber(chapter_html)  # total pages in this chapter
            for page in range(1, int(pagesNumber) + 1):
                url = base_url + str(chapter) + '_' + str(page) + '.html'
                text = get_page(url)
                if text is None:
                    print('Failed to fetch %s, skipping' % url)
                    continue
                for item in parse_page(text):
                    save_image(item)

     最后得到,

  • 相关阅读:
    To select the file to upload we can use the standard HTML input control of type
    Cascading Menu Script using Javascript Explained
    网站首页head区代码规范
    轻松掌握 Java 泛型
    JDK 5.0 中的泛型类型学习
    如何在firefox下获取下列框选中option的text
    是同步方法还是 synchronized 代码? 详解多线程同步规则
    javascript select option对象总结
    Select的动态取值(Text,value),添加,删除。兼容IE,FireFox
    javascript在ie和firefox下的一些差异
  • 原文地址:https://www.cnblogs.com/oeong/p/11768470.html
Copyright © 2011-2022 走看看