zoukankan      html  css  js  c++  java
  • 1kkk

    给基友下载漫画看

    代码:

      1 # !usr/bin/python3.4
      2 # -*- coding:utf-8 -*-
      3 
      4 import requests
      5 import os
      6 import time
      7 import re
      8 from lxml import etree
      9 import random
     10 
     11 def geturl(url,postdata):
     12     header = {'User-Agent':
     13                   'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
     14               'Referer':'http://m.1kkk.com/vol1-6871/',
     15               'Host': 'manhua1023.61-174-50-131.cdndm5.com',
     16               'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
     17               'Accept-Encoding': 'gzip, deflate',
     18               'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
     19               'Connection': 'keep-alive',
     20               }
     21 
     22     s = requests.Session()
     23     r = s.post('http://m.1kkk.com/userdata.ashx',data = postdata)
     24     _cookies = r.cookies
     25     #print(r.content)
     26     rs = s.get(url, headers=header,cookies = _cookies)
     27     return rs
     28 
     29 
     30 def get(url):
     31     header = {'User-Agent':
     32                   'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
     33               'Referer': 'http://www.1kkk.com/manhua589/',
     34               'Host': 'www.1kkk.com'}
     35 
     36     # 解析网页
     37     html_bytes = requests.get(url, headers=header)
     38 
     39     return html_bytes
     40 
     41 def mget(url):
     42     header = {'User-Agent':
     43                   'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
     44               'Referer': 'http://m.1kkk.com/manhua589/',
     45               'Host': 'm.1kkk.com'}
     46 
     47     # 解析网页
     48     html_bytes = requests.get(url, headers=header)
     49 
     50     return html_bytes
     51 
     52 
     53 # 去除标题中的非法字符 (Windows)
     54 def validateTitle(title):
     55     # '/:*?"<>|'
     56     rstr = r"[/\:*?"<>|]"
     57     new_title = re.sub(rstr, "", title)
     58     return new_title
     59 
     60 
     61 def prints(timesleep):
     62     print('暂停' + str(timesleep) + '秒后开始批量下载图片,请保持网络畅通...')
     63     time.sleep(timesleep)
     64 
     65 # 解析js
     66 def regnext(js):
     67     reg = r'(var.+?.split)'
     68     all = re.compile(reg);
     69     alllist = re.findall(all, js)
     70     return alllist
     71 
     72 # 递归创建文件夹
     73 def createjia(path):
     74     try:
     75         os.makedirs(path)
     76     except:
     77         print('目录已经存在:' + path)
     78 
     79 
if __name__ == '__main__':

    # Fetch the chapter index page of comic "manhua589" from the desktop site.
    html = get('http://www.1kkk.com/manhua589/').content.decode('utf-8', 'ignore')

    page = etree.HTML(html.lower())
    # Chapter URL suffixes, e.g. "/vol1-6871/".
    hrefs = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/@href')
    # Chapter labels (anchor text).
    hrefnames = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/text()')
    # Page-count text nodes that follow each anchor.
    hrefpages = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/text()')

    href = []
    hrefname = []
    hrefpage = []
    number = 1  # NOTE(review): never used below.

    # Not sure which of these volumes belong to the comic proper;
    # grab everything first and sort it out later.
    # Collect URL suffixes.
    for temp in hrefs:
        towurl = temp
        href.append(towurl)
    # Collect chapter labels.
    for temp in hrefnames:
        hrefname.append(temp)
    # Collect page counts.
    for temp in hrefpages:
        # NOTE(review): replace("", "") is a no-op; the text is kept as-is.
        hrefpage.append(temp.replace("", ""))

    j = 0
    # Output directory named after the first chapter label.
    filenamep = '../data/' + str(hrefname[0]) + "/"
    createjia(filenamep)

    for i in range(0, len(href)):
        for j in range(len(hrefpage)):

            # Chapter id (6871, 6872, ...) stripped out of the URL suffix.
            hrefnumber = str(href[i]).replace("ch54-","").replace("/","").replace("vol1-","")
            #print(hrefnumber)
            # Build the js URL, which yields e.g.
            # http://www.1kkk.com/vol1-6871/imagefun.ashx?cid=6871&page=1&key=65abd421f4aed565&maxcount=10
            jsurl = "http://www.1kkk.com" + str(href[i]) + "/imagefun.ashx?cid=" + str(hrefnumber) + "&page=" + str(j + 1) + "&key=65abd421f4aed565&maxcount=10"
            print(jsurl)

            # Fetch the js and split it into the pieces of the image URL.
            html = get(jsurl).content.decode('utf-8', 'ignore')
            html1 = regnext(html)
            html1 = html1[0].replace("'.split", "").split('|')

            # Reassemble the CDN image URL, e.g.
            # http://manhua1023.61-174-50-131.cdndm5.com/1/589/6871/102_9224.jpg?cid=6871&key=d8ce90e0b3f013f292ef77e84da88990&type=1
            # NOTE(review): cid is hard-coded to 6871 here, unlike jsurl above —
            # presumably wrong for any other chapter; confirm before reuse.
            image_1url = "http://manhua1023." + str(html1[19]) + "-" + str(html1[18]) + "-" + str(html1[9]) + "-" + str(
                html1[10]) + ".cdndm5.com/1/589/" + str(href[i]) + "/" + str(html1[20]) + "?cid=" + str(6871) + "&key=" + str(
                html1[8]) + "&type=1"
            print(image_1url)

            # Open the output file for this page.
            filess = open(filenamep + str(j + 1) + '.jpg', 'wb')

            # Forge the POST data the mobile reader sends.
            postdata = {
                'cid': 6871,
                'language': 1,
                'mid': 589,
                'page': j + 1,
                'tp': 8,
                'uid': 0
            }

            # Even with a correct URL the download may still be refused.
            pic = geturl(image_1url,postdata)
            filess.write(pic.content)
            filess.close()
            print('已经写入第' + str(j + 1) + '张图片')
            j = j + 1  # NOTE(review): overwritten by the for-loop's next iteration.

            # Pause 1-3 seconds between downloads.
            loadimg = random.randint(1, 3)
            print('暂停' + str(loadimg) + '')
            time.sleep(loadimg)

    selenium抓取:

     1 #!/usr/bin/python3.4
     2 # -*- coding: utf-8 -*-
     3 
     4 from selenium import webdriver
     5 import time
     6 from selenium.webdriver.common.keys import Keys
     7 import re
     8 
     9 
    10 # 去除标题中的非法字符 (Windows)
    11 def validateTitle(title):
    12     rstr = r"[/\:*?"<>|]"  # '/:*?"<>|'
    13     new_title = re.sub(rstr, "", title)
    14     return new_title
    15 
    16 
def getimg():
    """Drive Firefox through the mobile reader and screenshot each chapter.

    Loops over up to 10000 chapters: closes the ad overlay, scrolls to the
    bottom via injected JS (which appends "scroll-done" to document.title
    when finished), screenshots the window into ../jpg/cartoon/, then
    clicks through to the next chapter.  Uses the selenium-2-era
    find_element_by_* API and requires Firefox (geckodriver).
    """
    # Reference: http://www.cnblogs.com/fnng/p/3238685.html
    # Open Firefox (Chrome alternative left commented out).
    # browser = webdriver.Chrome()
    browser = webdriver.Firefox()

    # Set the browser window size.
    browser.set_window_size(1200, 900)
    # Load the first chapter.
    browser.get("http://m.1kkk.com/vol1-6871/")
    # Give the page time to load; tune to your connection speed.
    time.sleep(10)

    for i in range(10000):

        # Close the ad overlay.
        browser.find_element_by_class_name("ad_cross").click()

        # Scroll to the very bottom in 100px steps; the injected script
        # appends "scroll-done" to document.title when it reaches the end.
        browser.execute_script("""
            (function () {
                var y = 0;
                var step = 100;
                window.scroll(0, 0);

                function f() {
                    if (y < document.body.scrollHeight) {
                        y += step;
                        window.scroll(0, y);
                        setTimeout(f, 100);
                    } else {
                        window.scroll(0, 0);
                        document.title += "scroll-done";
                    }
                }

                setTimeout(f, 1000);
            })();
        """)
        print("下拉中...")
        #time.sleep(180)
        # Poll the title until the scroll script reports completion.
        while True:
            if "scroll-done" in browser.title:
                break
            else:
                print("还没有拉到最底端...")
                time.sleep(10)

        # Alternative wait loop: poll for the "next chapter" link instead.
        # while True:
        #     select = browser.find_element_by_xpath('//a[@class="readTipForm"]')
        #     if select:
        #         break
        #     else:
        #         print("还没有拉到最底端...")
        #         time.sleep(60)

        print("正在下载图片中...")
        # Name the screenshot after the (sanitised) current URL.
        name = validateTitle(browser.current_url)
        print("正在截图...")
        time.sleep(5)

        # Screenshot the visible window.
        browser.save_screenshot("../jpg/cartoon/" + str(i + 1) + str(name) + ".png")
        time.sleep(5)

        # Click "read next chapter".
        browser.find_element_by_class_name("readTipForm").click()
        print("准备进入下一章...")
        time.sleep(5)

    browser.quit()
    90 
    91 
if __name__ == '__main__':
    # Script entry point: start the selenium-driven screenshot run.
    getimg()
  • 相关阅读:
    ACM ICPC 2008–2009 NEERC MSC A, B, C, G, L
    POJ 1088 滑雪 DP
    UVA 11584 最短回文串划分 DP
    POJ 2531 Network Saboteur DFS+剪枝
    UVa 10739 String to Palindrome 字符串dp
    UVa 11151 Longest Palindrome 字符串dp
    UVa 10154 Weights and Measures dp 降维
    UVa 10271 Chopsticks dp
    UVa 10617 Again Palindrome 字符串dp
    UVa 10651 Pebble Solitaire 状态压缩 dp
  • 原文地址:https://www.cnblogs.com/TTyb/p/5831601.html
Copyright © 2011-2022 走看看