My first target for learning web scraping ----> jandan.net
Fetching the page with a User-Agent header set, I found that where the image links should be, each one had been replaced by a call to a JS function.
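For reference, the fetch step itself is just a Request with a headers dict; a minimal sketch using the same User-Agent as the final script further down (any mainstream browser UA string should work):

import urllib.request

url = 'http://jandan.net/ooxx/'
headers = {
    # example browser UA; the site rejects the default Python UA
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
req = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(req).read().decode('utf-8')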
So I searched the page source globally and found that function:
function jandan_load_img(b) {
    var d = $(b);
    var f = d.next("span.img-hash");
    var e = f.text();
    f.remove();
    var c = jdPwA5ybKhQWGy2rZybAF2StIBxrQ6NvYC(e, "pAqWmGD1GsmY5kVokg1a2eyQ3Shj1Usq");
    var a = $('<a href="' + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.(gif|jpg|jpeg))/, "$1large$3") + '" target="_blank" class="view_img_link">[查看原图]</a>');
    d.before(a);
    d.before("<br>");
    d.removeAttr("onload");
    d.attr("src", location.protocol + c.replace(/(\/\/\w+\.sinaimg\.cn\/)(\w+)(\/.+\.gif)/, "$1thumb180$3"));
    if (/\.gif$/.test(c)) {
        d.attr("org_src", location.protocol + c);
        b.onload = function() {
            add_img_loading_mask(this, load_sina_gif)
        }
    }
}
This function takes the text of span.img-hash and passes it to another function. Searching for that one turned up two definitions, so I picked the later one:
var jdTzcXZnL0V2WZZ8eq9786xeOdkyoBXlDR = function(m, r, d) {
    var e = "DECODE";
    var r = r ? r : "";
    var d = d ? d : 0;
    var q = 4;
    r = md5(r);
    var o = md5(r.substr(0, 16));
    var n = md5(r.substr(16, 16));
    if (q) { if (e == "DECODE") { var l = m.substr(0, q) } } else { var l = "" }
    var c = o + md5(o + l);
    var k;
    if (e == "DECODE") {
        m = m.substr(q);
        k = base64_decode(m)
    }
    var h = new Array(256);
    for (var g = 0; g < 256; g++) { h[g] = g }
    var b = new Array();
    for (var g = 0; g < 256; g++) { b[g] = c.charCodeAt(g % c.length) }
    for (var f = g = 0; g < 256; g++) {
        f = (f + h[g] + b[g]) % 256;
        tmp = h[g];
        h[g] = h[f];
        h[f] = tmp
    }
    var t = "";
    k = k.split("");
    for (var p = f = g = 0; g < k.length; g++) {
        p = (p + 1) % 256;
        f = (f + h[p]) % 256;
        tmp = h[p];
        h[p] = h[f];
        h[f] = tmp;
        t += chr(ord(k[g]) ^ (h[(h[p] + h[f]) % 256]))
    }
    if (e == "DECODE") {
        if ((t.substr(0, 10) == 0 || t.substr(0, 10) - time() > 0) && t.substr(10, 16) == md5(t.substr(26) + n).substr(0, 16)) {
            t = t.substr(26)
        } else { t = "" }
    }
    return t
};
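Reading past the obfuscated name, the body is recognizable: the key is derived via MD5 from the key string plus the first 4 characters of the hash, and the two loops over h are exactly the RC4 stream cipher (key scheduling, then the keystream XOR loop). The final substr(26) strips a header made of a 10-char timestamp and a 16-char checksum. A minimal standalone RC4 in Python, just to show the core the JS implements:

def rc4(key, data):
    """Plain RC4: `key` and `data` are bytes; returns the XORed bytes."""
    # key-scheduling loop (the first for-loop over h/b above)
    S = list(range(256))
    j = 0
    for i in range(256):
        j = (j + S[i] + key[i % len(key)]) % 256
        S[i], S[j] = S[j], S[i]
    # keystream loop (the second loop that builds t above)
    out = bytearray()
    i = j = 0
    for byte in data:
        i = (i + 1) % 256
        j = (j + S[i]) % 256
        S[i], S[j] = S[j], S[i]
        out.append(byte ^ S[(S[i] + S[j]) % 256])
    return bytes(out)

Since RC4 is a XOR cipher, the same function both encrypts and decrypts; the site's variant only differs in how it builds the key and in the header it strips afterwards.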
Reference article: http://www.tendcode.com/article/jiandan-meizi-spider/
It includes a Python rewrite of the JS function above.
The final code is as follows:
# -*- coding: utf-8 -*-
'''
Target: jandan.net ooxx (meizi) images
2018/4/22
Environment: python3
'''


import urllib.request   # urllib.request opens and reads URLs
import re               # regular expressions
import hashlib          # MD5
import base64           # Base64 decoding
import os               # path handling
import time             # polite delay between pages
import logging          # log
import sys
from bs4 import BeautifulSoup

'''
Download a single image into the given folder
'''
def load_img(imgurl, file):
    name = imgurl.split('/')[-1]
    item = urllib.request.urlretrieve('http:' + imgurl,
                                      os.path.join(file, name))
    print(name + ' is loaded')

'''
MD5 hash
'''
def _md5(value):
    m = hashlib.md5()
    m.update(value.encode('utf-8'))
    return m.hexdigest()

'''
Base64 decode
Note: the input must be padded to a multiple of 4, or b64decode raises an error
'''
def _base64_decode(data):
    missing_padding = -len(data) % 4   # 0 when the length already fits
    if missing_padding:
        data += '=' * missing_padding
    return base64.b64decode(data)

'''
Decrypt an img-hash into the image URL (Python port of the JS above)
'''
def get_imgurl(m, r='', d=0):
    e = "DECODE"
    q = 4
    r = _md5(r)
    o = _md5(r[0:16])
    n = _md5(r[16:32])
    l = m[0:q]
    c = o + _md5(o + l)
    m = m[q:]
    k = _base64_decode(m)
    h = list(range(256))
    b = [ord(c[g % len(c)]) for g in range(256)]

    # RC4 key scheduling
    f = 0
    for g in range(256):
        f = (f + h[g] + b[g]) % 256
        h[g], h[f] = h[f], h[g]

    # RC4 keystream XOR
    t = ""
    p, f = 0, 0
    for g in range(len(k)):
        p = (p + 1) % 256
        f = (f + h[p]) % 256
        h[p], h[f] = h[f], h[p]
        t += chr(k[g] ^ (h[(h[p] + h[f]) % 256]))
    t = t[26:]   # drop the 10-char timestamp + 16-char checksum header
    return t

'''
Extract the decoding key string from the site's JS file
'''
def get_r(js_url):
    js_respon = urllib.request.urlopen(js_url)
    js = js_respon.read().decode('utf-8')
    _r = re.findall(r'c=[\w\d]+\(e,"(.*?)"\)', js)
    return _r

'''
Walk the pages and download every image they link to
'''
def get_urls(url, pages, file):
    page = 0
    imagNum = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Host': 'jandan.net'
    }
    while page < pages:
        req = urllib.request.Request(url, headers=headers)
        respon = urllib.request.urlopen(req)
        html = respon.read().decode('utf-8')
        # locate the versioned JS file that carries the decoding key
        js_url = 'http:' + re.findall(r'<script src="(//cdn\.jandan\.net/static/min/[\w\d]+\.\d+\.js)"></script>', html)[-1]
        _r = get_r(js_url)[0]
        soup = BeautifulSoup(html, 'lxml')
        tags = soup.select('.img-hash')
        for tag in tags:
            img_hash = tag.text
            img_url = get_imgurl(img_hash, _r)
            print(imagNum, '------>', img_url)
            imagNum = imagNum + 1
            load_img(img_url, file)
        # the "Older Comments" link points at the next page
        nextUrl = re.findall(r'Older Comments" href=".+?#comments"', html)[0]
        print('page#', page + 1, '---->done!')
        url = 'http:' + nextUrl[22:-1]
        page += 1
        time.sleep(10)
    print('done all!')
    print('located---->', file)

if __name__ == '__main__':
    url = 'http://jandan.net/ooxx/'
    pages = 1
    file = 'C:\\jandan_meizi'
    get_urls(url, pages, file)
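One thing worth noting if you run this: urlretrieve fails when the target folder does not exist, so it may help to create it first. A small guard using only the standard library (an addition of mine, not part of the original script):

import os

file = 'C:\\jandan_meizi'
os.makedirs(file, exist_ok=True)   # create the folder if it is missing (Python 3.2+)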
//************2018.05.03*******************************
Just got home from work and ran the scraper again; it threw an error partway through.
The cause: get_imgurl can return several image links in a single decrypted string, so I changed this block:
for tag in tags:
    img_hash = tag.text
    img_urls = get_imgurl(img_hash, _r)
    img_urls = re.findall(r'//wx.+?\.jpg', img_urls)
    for img_url in img_urls:
        print(imagNum, '------>', img_url)
        imagNum = imagNum + 1
        load_img(img_url, file)
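Note this pattern only keeps .jpg links on hosts starting with //wx; judging from the regex in jandan_load_img above, the images live on *.sinaimg.cn and can also be gif or jpeg. A broader pattern (my assumption, not tested against every page) would be:

img_urls = re.findall(r'//\w+\.sinaimg\.cn/.+?\.(?:jpg|jpeg|gif)', img_urls)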
//************2018.5.23***********************************
It errored out again; I haven't solved this one yet.
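The script already imports logging but never uses it. Until the root cause turns up, one way to keep the crawl alive and capture the failure is to wrap the per-image work in try/except and log the offending hash; a sketch of the idea, not a fix:

import logging

logging.basicConfig(filename='jandan_spider.log', level=logging.WARNING)

for tag in tags:
    img_hash = tag.text
    try:
        img_urls = re.findall(r'//wx.+?\.jpg', get_imgurl(img_hash, _r))
        for img_url in img_urls:
            load_img(img_url, file)
            imagNum += 1
    except Exception:
        # record the hash that broke decoding/downloading, then keep crawling
        logging.exception('failed on hash: %s', img_hash)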