# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 18:23:10 2019

@author: Administrator
"""
import requests
import time
#from bs4 import BeautifulSoup   # only needed by the commented-out scraping draft below
#import re                       # unused
#import xlwt
'''
# Draft: log results to an Excel sheet instead of a text file.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('test', cell_overwrite_ok=True)
sheet.write(0, 0, link[-7:-4])
book.save(r'e:\test1.xls')
'''

#url = "https://www.mzitu.com/169451"
my_referer = r'https://www.mzitu.com/169451'

# Draft: scrape the image link from the page. No longer needed, because the
# image links can be generated directly (see nyr() below).
'''
r = requests.get(url, headers={'referer': my_referer})
r.encoding = r.apparent_encoding
html = r.content
soup = BeautifulSoup(html, "html.parser")
s = soup.select("div p a")[0].img["src"]
'''
# Get the image URL out of the link:
#s = soup.select(".article-content")
#type(s[0])
#Out[18]: bs4.element.Tag
#t = s[0].get_text()
#f = open("d:/测试解析文档学习.html", "w", encoding="utf-8")   # "parsing-practice" scratch file
#f.write(str(s))

# Sample URL prefixes observed on the site:
'''
a = 'https://i.meizitu.net/2019/01/13d'
b = "https://i.meizitu.net/2018/12/29d"
c = "https://i.meizitu.net/2017/01/01b"
d = "https://i.meizitu.net/2017/01/02b"
ls = [a, b, c, d]
'''

"""
p1 = ["0" + str(i) for i in range(1, 10)]   # quick list comprehension
p1 += ["10", "11", "12"]                    # append() takes one argument, so extend instead
"""

site = "https://i.meizitu.net/"
year = [site + str(i) + "/" for i in range(2015, 2020)]   # prefixes for the years 2015-2019

# Analysis shows most image URLs look like
# https://i.meizitu.net/2018/12/28a01.jpg, and the trailing letter is mostly
# a, b or c, so restricting the letters would save some time.
#p2 = [chr(i) for i in range(97, 123)]   # generate a-z

def nyr(y):
    """nyr = nian/yue/ri (year/month/day): one URL prefix per day of year y."""
    p0 = ["0" + str(i) for i in range(1, 10)] + [str(i) for i in range(10, 13)]   # months 01-12
    p1 = ["0" + str(i) for i in range(1, 10)] + [str(i) for i in range(10, 32)]   # days 01-31
    url_day = []
    for k in p0:
        for i in p1:
            url_day.append(y + k + '/' + i)   # prefix for one specific day
    return url_day

# Walking whole years this way is not very practical; honestly, crawling a
# single month is more realistic:
"""
p12 = "https://i.meizitu.net/2018/10/"
url_Nov = []
for k in p1:
    for i in p2:
        url_Nov.append(p12 + k + i)
"""

header = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
    "Referer": r'https://www.mzitu.com/169451',
}
#headers = {'referer': my_referer}

def download(url):
    #p2 = ["a", "b", "c"]
    p2 = [chr(i) for i in range(97, 123)]   # a-z
    for j in p2:
        for i in range(1, 60):
            if i < 10:
                link = url + j + "0" + str(i) + ".jpg"
            else:
                link = url + j + str(i) + ".jpg"
            try:
                # Very short timeout: missing links fail fast, but slow
                # responses for real images may be aborted too.
                r1 = requests.get(link, timeout=0.1, headers=header)
                r1.raise_for_status()   # raises HTTPError on 404 etc.
                # link[-17:-4] is e.g. "2018/12/28a01" -> filename "20181228a01"
                # ("f:/爬虫生成文件/" = "f:/crawler output/")
                with open("f:/爬虫生成文件/2015-/" + link[-17:-4].replace("/", "") + ".jpg", "wb") as f:
                    f.write(r1.content)
            except requests.RequestException:
                # Log the letter+index that failed, e.g. "o01", then move on
                # to the next letter.  ("爬虫字母统计" = "crawler letter stats")
                with open("f:/爬虫生成文件/爬虫字母统计.txt", "a", encoding="utf-8") as k:
                    k.write(link[-7:-4] + ",")
                print("missing:", "{:^10}".format(link))
                break

def main():
    start_time = time.time()
    for j in range(len(year)):
        n = nyr(year[j])
        for i in range(len(n)):
            download(n[i])
    end_time = time.time()
    print("{:10}".format(end_time - start_time))

if __name__ == "__main__":
    main()

"""
def main():
    for i in url_Nov[:200]:
        download(i)
"""

"""
Test against a file that does not exist:

x
Out[107]: 'https://i.meizitu.net/2018/12/12o1'

r1 = requests.get(x, headers={'referer': my_referer})
r1
Out[109]: <Response [404]>

Now it is clear why the try/except plus raise_for_status() is necessary here:
a generated link may simply not exist on the server.

url[-11:].replace("/", "")
"""
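# A minimal sketch (disabled, in the same style as the draft blocks above) of
# the single-URL probe that the test notes describe. The candidate URL below
# is a hypothetical example, and using HEAD instead of GET is an assumption to
# avoid downloading the body; the script itself uses GET.
'''
probe = requests.head("https://i.meizitu.net/2018/12/12o01.jpg",   # hypothetical candidate
                      headers=header, timeout=5)
print(probe.status_code)    # 404 -> this letter/index combination does not exist
probe.raise_for_status()    # raises requests.HTTPError for 4xx/5xx responses
'''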