zoukankan      html  css  js  c++  java
  • python 使用selenium爬取进击的巨人漫画

      1 import requests
      2 from bs4 import BeautifulSoup
      3 import os
      4 from selenium import webdriver
      5 from selenium.webdriver.firefox.webdriver import WebDriver
      6 from selenium.webdriver.support.wait import WebDriverWait
      7 from selenium.webdriver.support import expected_conditions as EC
      8 from selenium.webdriver.common.by import By
      9 class Down_Cartoon():
     10     def __init__(self):
     11         self.content_url='https://www.manhuabei.com/manhua/jinjidejuren/'
     12         self.base_url='https://www.manhuabei.com'
     13         self.header={"Use-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" }
     14         self.html_path=r'd:进击的巨人.txt'
     15         self.file_path=r'D:OneDrive漫画进击的巨人'
     16         
     17     def get_url(self,url):
     18         '''
     19         通用url请求
     20         '''
     21         r=requests.get(url,headers=self.header)
     22         if r.status_code==200:
     23             return r.text
     24         else:
     25             return ""           
     26 
     27     def parse_html(self,html_content):
     28         '''
     29         BeautifulSoup解析网页
     30         返回每个章节名称列表和每个章节首页列表
     31         '''
     32         soup=BeautifulSoup(html_content,'lxml')
     33         #self.save_webxml(self.html_path,soup.prettify())
     34         main=soup.find('ul',class_="list_con_li autoHeight")
     35         content=main.find_all('a')
     36         print("总章节:",len(content))
     37         chapter_url=[]
     38         title_name=[]
     39         for p in content:
     40             title_name.append(p['title'])
     41             chapter_url.append(p['href'])
     42         return chapter_url,title_name
     43 
     44 
     45     def save_webxml(self,file_path, xml_content):
     46         '''
     47         保存html至本地
     48         '''
     49         with open(file_path,'w',encoding='UTF-8',errors='ignore') as write_blog:
     50             write_blog.write(xml_content)
     51 
     52     def download_one_page(self,href,dir_path,num):
     53         '''
     54         下载一个图片并保存
     55         '''
     56         strpic=str(num+1)+'.jpg'
     57         full_path=os.path.join(dir_path,strpic)
     58         if not os.path.exists(full_path):
     59             try:
     60                 r=requests.get(href,headers=self.header)
     61                 if r.status_code==200:
     62                      with open(full_path,'wb') as img:
     63                         img.write(r.content)
     64                         print(strpic,"success")                 
     65                 else:
     66                     print(full_path,'下载失败',href)
     67             except:
     68                 print('下载失败',href)
     69         else:
     70             print(strpic,'图片已存在,无需下载')  
     71 
     72     def mkdir(self,own_dir_name):
     73         '''创建文件夹'''
     74         own_dir_name=own_dir_name.strip()
     75         full_path= os.path.join(self.file_path,own_dir_name)
     76         isExists=os.path.exists(full_path)
     77         if not isExists:
     78             #print("创建",own_dir_name,"文件夹")
     79             os.makedirs(full_path)
     80             os.chdir(full_path)
     81             return full_path
     82         else:
     83             #print(own_dir_name,'文件夹已存在')
     84             return full_path
     85 
     86     def run(self):
     87         content_list,title_list= self.parse_html(self.get_url(self.content_url))
     88         brower=webdriver.Chrome()
     89         self.download_content(brower,content_list,title_list)
     90         brower.quit()                
     91 
     92     def download_content(self,browerdrive,content_list,title_list):
     93         '''
     94         下载漫画
     95         '''
     96         cartoon_href_list=[]
     97         for i,title in enumerate(title_list):
     98             chapter_name=title.split(" ")[0]
     99             print("正在下载第%s,总共%s话"%(chapter_name,len(title_list)))
    100             dir_path=self.mkdir(chapter_name)
    101             full_url=self.base_url+content_list[i]
    102             browerdrive.get(full_url)
    103             img_url_list=[]
    104             chapter_info={}
    105             try:
    106                 img_info= browerdrive.find_element_by_class_name("img_info")
    107             except:
    108                 print("爬取失败!")
    109                 continue
    110             tag_string=img_info.text
    111             try:
    112                 init_page=browerdrive.find_element_by_css_selector("img[style='display: inline;']").get_attribute('src')
    113             except:
    114                 print("爬取失败!")
    115                 continue         
    116             img_url_list.append(init_page)
    117             num=int(tag_string.split('/')[1][0:2])
    118             print("dir_path:",dir_path)
    119             #print(num+1,len(os.listdir(dir_path)))
    120             if num+1==len(os.listdir(dir_path)):
    121                 print("第%s已下载"%(chapter_name))
    122                 continue
    123             self.download_one_page(init_page,dir_path,0)
    124             chapter_href=self.download_chapter(browerdrive,dir_path,num)
    125             img_url_list.extend(chapter_href)
    126             chapter_info['href']=img_url_list
    127             chapter_info['chapter_name']=chapter_name
    128             cartoon_href_list.append(chapter_info)            
    129         return cartoon_href_list 
    130 
    131     def download_chapter(self,browerdrive,dir_path,max_num):
    132         '''
    133         下载一章节
    134         '''
    135         img_url=[]
    136         for x in range(0,max_num):
    137             browerdrive.find_element_by_class_name("img_land_next").click()
    138             wait=WebDriverWait(browerdrive,10)
    139             try:
    140                 wait_element=wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"img[style='display: inline;']")))
    141                 #href=browerdrive.find_element_by_css_selector("img[style='display: inline;']").get_attribute('src')
    142                 href=wait_element.get_attribute('src')
    143                 print("准备下载图片:",x+2)
    144                 self.download_one_page(href,dir_path,x+1)
    145                 img_url.append(href)
    146             except:
    147                 print("wait失败!")
    148                 continue
    149                         
    150         return img_url       
    151 
    152 if  __name__=='__main__':
    153     down_load=Down_Cartoon() 
    154     down_load.run()  
    155     
  • 相关阅读:
    AutoFac中常用方法说明
    DI之循环依赖
    NB/T 10215-2019 风力发电机组 测风传感器等最新能源行业标准
    DL/T 691-2019等最新电力行业标准
    TSG D7006-2020 压力管道监督检验规则
    YY/T 0595-2020 医疗器械 质量管理体系YY/T 0287-2017 应用指南
    GB/T 38775-2020系列电动汽车无线充电系统标准
    最新电动汽车安全标准
    GB 38032-2020 电动客车安全要求
    GB 38031-2020 电动汽车用动力蓄电池安全要求
  • 原文地址:https://www.cnblogs.com/mariolz/p/13693523.html
Copyright © 2011-2022 走看看