zoukankan      html  css  js  c++  java
  • 1kkk

    给基友下载漫画看

    代码:

      1 #!/usr/bin/python3.4
      2 # -*- coding:utf-8 -*-
      3 
      4 import requests
      5 import os
      6 import time
      7 import re
      8 from lxml import etree
      9 import random
     10 
     def geturl(url, postdata):
         """Download ``url`` through a session primed by the mobile user-data endpoint.

         The function first POSTs ``postdata`` to ``http://m.1kkk.com/userdata.ashx``
         and then GETs ``url`` with image-request headers, passing along the
         cookies returned by the POST.

         Args:
             url: Address of the resource (an image URL in this script) to fetch.
             postdata: Form fields sent to ``userdata.ashx``.

         Returns:
             The ``requests`` response object of the GET request.
         """
         # NOTE(review): 'Host' is pinned to one CDN node while ``url`` is built
         # dynamically by the caller; a different host in ``url`` would be sent
         # with a mismatching Host header - confirm this is intended.
         header = {'User-Agent':
                       'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
                   'Referer': 'http://m.1kkk.com/vol1-6871/',
                   'Host': 'manhua1023.61-174-50-131.cdndm5.com',
                   'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
                   'Accept-Encoding': 'gzip, deflate',
                   'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                   'Connection': 'keep-alive',
                   }

         # The context manager closes the session (and its pooled connections)
         # even if one of the requests raises. The GET is not streamed, so the
         # response body is already downloaded when the session is closed.
         with requests.Session() as s:
             r = s.post('http://m.1kkk.com/userdata.ashx', data=postdata)
             rs = s.get(url, headers=header, cookies=r.cookies)
         return rs
     28 
     29 
     def get(url):
         """Send a GET request for ``url`` using the desktop-site headers.

         Returns the ``requests`` response object; the caller decodes the body.
         """
         # Desktop Firefox identity; Referer and Host point at the www site.
         desktop_headers = {}
         desktop_headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'
         desktop_headers['Referer'] = 'http://www.1kkk.com/manhua589/'
         desktop_headers['Host'] = 'www.1kkk.com'

         return requests.get(url, headers=desktop_headers)
     40 
     def mget(url):
         """Send a GET request for ``url`` using the mobile-site (m.1kkk.com) headers.

         Returns the ``requests`` response object; the caller decodes the body.
         """
         # Same Firefox identity as the desktop helper, but Referer and Host
         # point at the mobile site.
         mobile_headers = {}
         mobile_headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0'
         mobile_headers['Referer'] = 'http://m.1kkk.com/manhua589/'
         mobile_headers['Host'] = 'm.1kkk.com'

         return requests.get(url, headers=mobile_headers)
     51 
     52 
     # Remove characters that are not allowed in file names (Windows)
     def validateTitle(title):
         """Return ``title`` with the characters / \\ : * ? " < > | removed.

         Args:
             title: Text that will be used as (part of) a file name.

         Returns:
             A copy of ``title`` without any of the nine forbidden characters.
         """
         # Single-quoted raw string so the double quote can sit inside the
         # character class; the backslash itself is matched by ``\\``.
         rstr = r'[/\\:*?"<>|]'
         return re.sub(rstr, "", title)
     59 
     60 
     def prints(timesleep):
         """Announce a pause of ``timesleep`` seconds, then sleep for that long."""
         notice = '暂停{}秒后开始批量下载图片,请保持网络畅通...'.format(timesleep)
         print(notice)
         time.sleep(timesleep)
     64 
     # Pull the packed variable declarations out of the returned JavaScript
     def regnext(js):
         """Return every ``var ... split`` fragment found in ``js``.

         Each match starts at ``var`` and runs (non-greedily) up to and
         including the next ``split``; the character right before ``split`` is
         matched by an unescaped ``.`` and may therefore be any character.
         ``.`` does not cross newlines, so a match never spans several lines.

         Args:
             js: JavaScript source text.

         Returns:
             A list of matched substrings, empty when nothing matches.
         """
         # No local named ``all`` any more: it shadowed the builtin.
         return re.findall(r'(var.+?.split)', js)
     71 
     # Create a directory together with any missing parents
     def createjia(path):
         """Create ``path`` (including intermediate directories).

         An already existing ``path`` is reported on stdout and otherwise
         ignored. Any other failure (for example missing permissions) is
         raised instead of being mislabelled as "already exists".

         Args:
             path: Directory to create.

         Raises:
             OSError: If the directory cannot be created for a reason other
                 than it already existing.
         """
         try:
             os.makedirs(path)
         except FileExistsError:
             print('目录已经存在:' + path)
     78 
     79 
     if __name__ == '__main__':
         # Fetch the index page of comic 589; the markup is lower-cased before
         # parsing, so every extracted value below is lower-case as well.
         html = get('http://www.1kkk.com/manhua589/').content.decode('utf-8', 'ignore')
         page = etree.HTML(html.lower())

         chapter_items = '//ul[@class="sy_nr1 cplist_ullg"][2]/li'
         # Chapter link suffixes
         href = list(page.xpath(chapter_items + '/a/@href'))
         # Chapter labels
         hrefname = list(page.xpath(chapter_items + '/a/text()'))
         # Text nodes that hold the page counts
         hrefpage = list(page.xpath(chapter_items + '/text()'))

         if not href or not hrefname:
             raise SystemExit('没有找到任何章节,请检查页面结构是否变化')

         # Everything is stored in one folder named after the first chapter label.
         filenamep = '../data/' + str(hrefname[0]) + "/"
         createjia(filenamep)

         # It is unclear whether the extra volumes belong to the comic itself;
         # download them anyway.
         for chapter_href in href:
             # e.g. "/vol1-6871/" -> "vol1-6871"; the trailing number (6871, 6872, ...)
             # is the chapter id used as ``cid``.
             chapter_path = str(chapter_href).strip('/')
             id_match = re.search(r'(\d+)$', chapter_path)
             if id_match is None:
                 print('无法解析章节编号:' + chapter_path)
                 continue
             hrefnumber = id_match.group(1)

             # NOTE(review): this iterates once per page-count text node found on
             # the index page, not once per page of the current chapter - confirm
             # the intended upper bound.
             for j in range(len(hrefpage)):
                 page_no = j + 1

                 # Build the JS URL, e.g.
                 # http://www.1kkk.com/vol1-6871/imagefun.ashx?cid=6871&page=1&key=65abd421f4aed565&maxcount=10
                 jsurl = ("http://www.1kkk.com/" + chapter_path + "/imagefun.ashx?cid=" + hrefnumber
                          + "&page=" + str(page_no) + "&key=65abd421f4aed565&maxcount=10")
                 print(jsurl)

                 # The response is packed JavaScript; its '|'-separated word list
                 # carries the pieces of the image URL.
                 js = get(jsurl).content.decode('utf-8', 'ignore')
                 packed = regnext(js)
                 if not packed:
                     print('没有解析到图片信息:' + jsurl)
                     continue
                 html1 = packed[0].replace("'.split", "").split('|')
                 if len(html1) <= 20:
                     print('图片信息不完整:' + jsurl)
                     continue

                 # Build the image URL, e.g.
                 # http://manhua1023.61-174-50-131.cdndm5.com/1/589/6871/102_9224.jpg?cid=6871&key=d8ce90e0b3f013f292ef77e84da88990&type=1
                 image_1url = ("http://manhua1023." + str(html1[19]) + "-" + str(html1[18]) + "-"
                               + str(html1[9]) + "-" + str(html1[10])
                               + ".cdndm5.com/1/589/" + hrefnumber + "/" + str(html1[20])
                               + "?cid=" + hrefnumber + "&key=" + str(html1[8]) + "&type=1")
                 print(image_1url)

                 # Form data that mimics the reader's own request; ``cid`` follows
                 # the current chapter instead of being fixed to 6871.
                 postdata = {
                     'cid': int(hrefnumber),
                     'language': 1,
                     'mid': 589,
                     'page': page_no,
                     'tp': 8,
                     'uid': 0
                 }

                 # Original author's remark: even with a correct URL the download
                 # does not succeed.
                 pic = geturl(image_1url, postdata)

                 # Download first, then open the file, so a failed request leaves
                 # no empty file behind. The chapter id is part of the name so
                 # that chapters do not overwrite each other's pages.
                 with open(filenamep + hrefnumber + '_' + str(page_no) + '.jpg', 'wb') as filess:
                     filess.write(pic.content)
                 print('已经写入第' + str(page_no) + '张图片')

                 # Pause 1-3 seconds after every download
                 loadimg = random.randint(1, 3)
                 print('暂停' + str(loadimg))
                 time.sleep(loadimg)

    selenium抓取:

     1 #!/usr/bin/python3.4
     2 # -*- coding: utf-8 -*-
     3 
     4 from selenium import webdriver
     5 import time
     6 from selenium.webdriver.common.keys import Keys
     7 import re
     8 
     9 
    # Remove characters that are not allowed in file names (Windows)
    def validateTitle(title):
        """Return ``title`` with the characters / \\ : * ? " < > | removed.

        Args:
            title: Text that will be used as (part of) a file name.

        Returns:
            A copy of ``title`` without any of the nine forbidden characters.
        """
        # Single-quoted raw string so the double quote can sit inside the
        # character class; the backslash itself is matched by ``\\``.
        rstr = r'[/\\:*?"<>|]'
        new_title = re.sub(rstr, "", title)
        return new_title
    15 
    16 
    # Script injected into the page: scrolls down in 100px steps every 100ms
    # and, once the bottom is reached, jumps back to the top and appends
    # "scroll-done" to the document title so the Python side can detect it.
    _SCROLL_JS = """
                (function () {
                    var y = 0;
                    var step = 100;
                    window.scroll(0, 0);

                    function f() {
                        if (y < document.body.scrollHeight) {
                            y += step;
                            window.scroll(0, y);
                            setTimeout(f, 100);
                        } else {
                            window.scroll(0, 0);
                            document.title += "scroll-done";
                        }
                    }

                    setTimeout(f, 1000);
                })();
            """


    def getimg():
        """Screenshot chapter after chapter of the comic in a Firefox window.

        Starting at ``http://m.1kkk.com/vol1-6871/`` the function, for up to
        10000 chapters, closes the ad overlay, scrolls the page to the bottom,
        saves a screenshot under ``../jpg/cartoon/`` and clicks the
        "next chapter" link. It stops early when that link is missing. The
        browser is always closed, also when an error occurs.

        See http://www.cnblogs.com/fnng/p/3238685.html for the selenium
        background the original author referred to.
        """
        # Local imports: these names are not imported at the top of the script.
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.common.by import By

        # Open Firefox
        browser = webdriver.Firefox()
        try:
            # Set the window size
            browser.set_window_size(1200, 900)
            # Load the first chapter
            browser.get("http://m.1kkk.com/vol1-6871/")
            # Wait for the page to load; adjust to the network speed
            time.sleep(10)

            for i in range(10000):
                # Close the ad overlay; not finding one is not an error.
                try:
                    browser.find_element(By.CLASS_NAME, "ad_cross").click()
                except NoSuchElementException:
                    pass

                # Scroll to the very bottom of the page
                browser.execute_script(_SCROLL_JS)
                print("下拉中...")
                # Poll until the injected script has marked the title.
                while "scroll-done" not in browser.title:
                    print("还没有拉到最底端...")
                    time.sleep(10)

                print("正在下载图片中...")
                # The screenshot name is derived from the current URL
                name = validateTitle(browser.current_url)
                print("正在截图...")
                time.sleep(5)

                # Take the screenshot
                browser.save_screenshot("../jpg/cartoon/" + str(i + 1) + str(name) + ".png")
                time.sleep(5)

                # Click "read next chapter"; without that link there is
                # nothing left to capture.
                try:
                    next_link = browser.find_element(By.CLASS_NAME, "readTipForm")
                except NoSuchElementException:
                    print("没有找到下一章,抓取结束")
                    break
                next_link.click()
                print("准备进入下一章...")
                time.sleep(5)
        finally:
            # Always release the browser, also after an exception.
            browser.quit()


    if __name__ == '__main__':
        getimg()
  • 相关阅读:
    机器学习——集成学习之Boosting
    机器学习——集成学习之Bagging
    Javascript获取html元素的几种方法
    JSTL 标签大全详解
    浅谈web应用的负载均衡、集群、高可用(HA)解决方案
    Spring中ApplicationContext加载机制和配置初始化
    Hibernate注解详解(超全面不解释)
    hibernate注解主键生成策略
    Java中Filter、Servlet、Listener的学习
    注解 @Resource与@Autowired与@Component的使用
  • 原文地址:https://www.cnblogs.com/TTyb/p/5831601.html
Copyright © 2011-2022 走看看