  • Scraping comics from 哦漫画 (omanhua.com) with Python

    import os

    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    from selenium import webdriver


    # Parse one comic page and download its image
    def manhua(url):
        # Load the page in the automated browser; the image is inserted by
        # JavaScript, so its URL cannot be read from the raw HTML alone
        browser.get(url)

        # Source of the rendered page
        html = browser.page_source

        html = etree.HTML(html)
        img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
        alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
        title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
        print(img_url, alt, title)

        # Save under ./漫画/<comic>/<chapter>/, creating the folders if needed
        path = './漫画/' + alt + '/' + title + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        fname = img_url.split('/')[-1]
        print(os.path.join(path, fname))

        # Alternative: urllib.request.urlretrieve(img_url, os.path.join(path, fname))

        # Request the image URL
        response = requests.get(img_url)
        # Binary content of the image
        data = response.content
        # Write it to disk
        with open(path + fname, 'wb') as f:
            f.write(data)


    # Parse one chapter and build the link of every page in it
    def manhua_url(url):
        response = requests.get(url)
        response.encoding = response.apparent_encoding
        html = response.text
        html = etree.HTML(html)
        # i is the page count, taken from text such as "(12)"
        i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
        i = int(i)
        # The pages follow a simple pattern, so build each URL with format()
        url = url + '/index.html?p={}'
        for n in range(1, i + 1):
            fullurl = url.format(n)
            print(fullurl)
            # time.sleep(2)
            # fullurl is the address of one comic page
            manhua(fullurl)


    # Parse a comic's chapter-list page
    # (the name shadows the built-in list(), which this script never uses)
    def list(lb_url):
        response = requests.get(lb_url)
        response.encoding = response.apparent_encoding
        html = response.text
        html = BeautifulSoup(html, 'lxml')
        # Match every chapter link
        url_list = html.select('div.subBookList ul li')
        for url in url_list:
            url = url.select('a')[0].get('href').split('/')[-2]
            fullurl = os.path.join(lb_url, url)
            print(fullurl)
            # Chapter link
            manhua_url(fullurl)


    # Parse the home page
    def shouye():
        # Home page URL
        base_url = 'http://www.omanhua.com/'
        # Send the request
        response = requests.get(base_url)
        # Decode with the detected encoding
        response.encoding = response.apparent_encoding
        # The returned page
        html = response.text
        # Parse it
        html = BeautifulSoup(html, 'lxml')
        # Match the links in the "hottest comics" block
        url_list = html.select('ul#cartoon_image_show1 li')
        for url in url_list:
            # Drop the leading "/" so the join below works
            url = url.select('a')[0].get('href')[1:]
            # Build the full comic URL
            fullurl = os.path.join(base_url, url)
            print(fullurl)
            list(fullurl)


    if __name__ == '__main__':
        # Drive a real browser (Chrome here) with the selenium automation
        # module, because the image cannot be fetched from the static HTML.
        # The path below points at the chromedriver executable.
        browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
        shouye()
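
    Note: the executable_path argument works on the Selenium 3 line this post
    was written against, but Selenium 4 deprecated and later removed it. A
    minimal sketch of the equivalent setup under Selenium 4, assuming the same
    driver location, would be:

        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service

        # Selenium 4 style: the driver path moves into a Service object.
        # The path is carried over from the script above.
        service = Service(r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
        browser = webdriver.Chrome(service=service)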

    I have only recently started teaching myself web scraping, so the code may be a bit clumsy; I hope we can all learn and improve together.
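
    One practical note: the loop above downloads images as fast as it can, and
    some image hosts refuse requests that lack a Referer header. Whether
    omanhua.com checks the Referer is an assumption on my part, but a more
    defensive version of the image download, with the pause the commented-out
    time.sleep(2) hints at, could look like this (download_image is a
    hypothetical helper, not part of the original script):

        import time

        import requests

        def download_image(img_url, page_url, save_path):
            # Send the comic page as Referer; whether this host requires
            # it is an assumption, but many image servers do check it
            headers = {'Referer': page_url,
                       'User-Agent': 'Mozilla/5.0'}
            response = requests.get(img_url, headers=headers)
            with open(save_path, 'wb') as f:
                f.write(response.content)
            # Pause between downloads to go easy on the site
            time.sleep(2)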

  • Original post: https://www.cnblogs.com/lyxdw/p/9226583.html