"""Download 'ooxx' images from jandan.net, routing requests through
rotating HTTP proxies scraped from xicidaili.com.

Flow: get_proxy() fills the global proxy pool, change_proxy() installs a
random proxy into urllib's global opener, then download() walks the ten
most recent comment pages and saves every image it finds.
"""

import http.client
import os
import random
import re
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError

# Pool of "ip:port" strings, populated by get_proxy() at startup.
proxy = []

# Browser-like UA so neither site serves a bot-blocking page.
_USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')


def change_proxy():
    """Install a urllib opener that tunnels HTTP through a random proxy.

    Side effect: replaces the process-wide default opener
    (urllib.request.install_opener), so all later urlopen() calls use it.
    Raises IndexError if the proxy pool is empty.
    """
    proxy_ip = random.choice(proxy)
    proxy_support = urllib.request.ProxyHandler({"http": proxy_ip})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', _USER_AGENT)]
    urllib.request.install_opener(opener)
    print("代理IP: %s" % proxy_ip)


def url_open(url):
    """Fetch *url* and return the raw response bytes.

    On any network/HTTP error, switch to a fresh random proxy and retry.
    After 5 consecutive failures the last exception is re-raised
    (returning None would only defer the crash to the caller's .decode()).
    """
    count = 0
    while True:
        try:
            response = urllib.request.urlopen(url)
            return response.read()
        except (OSError, URLError,
                http.client.BadStatusLine,
                http.client.IncompleteRead) as e:
            # Note: URLError subclasses OSError; listed explicitly to keep
            # the original intent visible.
            count += 1
            print("链接出问题了,智能切换新的代理IP 出错的问题是:" + str(e))
            if count >= 5:
                print("已经失败了5次,程序退出,重新执行")
                raise
            change_proxy()


def get_pagenum(url):
    """Return the current comment-page number shown on *url* as a string.

    Matches the 4-digit number inside
    '<span class="current-comment-page">[NNNN]</span>'.
    Raises AttributeError if the marker is absent from the page.
    """
    html = url_open(url).decode("utf-8")
    num_re = re.compile(r'<span\sclass="current-comment-page">\[\d{4}\]</span>')
    marker = num_re.search(html)
    return re.search(r'\d{4}', marker.group()).group()


def get_imgurl(url):
    """Return a list of protocol-relative image URLs ('//ww....jpg') on *url*."""
    html = url_open(url).decode("utf-8")
    jpg_re = re.compile(r'<img src="//ww.*\.jpg')
    inner = re.compile(r'//ww.+\.jpg')
    # Strip the '<img src="' prefix from each matched tag fragment.
    return [inner.findall(tag)[0] for tag in jpg_re.findall(html)]


def save_img(img):
    """Download each URL in *img* into the current directory.

    File names are taken from the last path segment of each URL.
    The fetch happens before the file is opened, so a failed download
    does not leave an empty file behind.
    """
    for i, each in enumerate(img, 1):
        filename = each.split('/')[-1]
        imgpage = url_open("http:%s" % each)
        with open(filename, 'wb') as f:
            f.write(imgpage)
        print("下载本页的第%s张图片,名称为%s" % (i, filename))


def get_proxy():
    """Scrape HTTP proxies from xicidaili.com into the global *proxy* list.

    Only rows whose protocol column reads 'HTTP' are kept, appended as
    'ip:port'. Returns the (shared, mutated) global list.
    """
    head = {'User-Agent': _USER_AGENT}
    req = urllib.request.Request(url="http://www.xicidaili.com", headers=head)
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8")
    # One <tr> per proxy; columns: country, ip, port, ..., protocol, ...
    row_re = re.compile(r'''<tr\sclass=.+>\s+
                            <td\s.+</td>\s+
                            <td>.+</td>\s+
                            <td>.+</td>\s+
                            <td>.+</td>\s+
                            <td\s.+?</td>\s+
                            <td>.+</td>\s+
                            <td>.+</td>\s+
                            <td>.+</td>\s+
                            </tr>
                            ''', re.VERBOSE)
    for row in row_re.findall(html):
        fields = row.split()
        # Each cell looks like '<td>value</td>'; split on '>' then '<'
        # to pull out the inner text.
        protocol = fields[-4].split(">")[1].split("<")[0]
        ip = fields[7].split(">")[1].split("<")[0]
        port = fields[8].split(">")[1].split("<")[0]
        if protocol == "HTTP":
            proxy.append(ip + ":" + port)
    return proxy


def download(dir, url):
    """Download images from the 10 pages preceding the newest page of *url*.

    Creates *dir* if needed and chdir()s into it (process-wide side
    effect), then walks page numbers downward from the current one.
    """
    if not os.path.isdir(dir):
        os.mkdir(dir)
    os.chdir(dir)
    page_num = int(get_pagenum(url))
    for _ in range(10):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        imgurl = get_imgurl(pageurl)
        print("下载第%s页图片" % page_num)
        save_img(imgurl)


if __name__ == "__main__":
    get_proxy()
    change_proxy()
    dir = "ooxx"
    url = "http://jandan.net/ooxx/"
    download(dir, url)