Getting started
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 14 18:23:10 2019

@author: Administrator
"""
import requests
from bs4 import BeautifulSoup  # only used by the commented-out parsing experiment below

#url="https://www.mzitu.com/169451"
my_referer = r'https://www.mzitu.com/169451'

# Originally used to scrape the image links from the page; no longer needed,
# because the links can be generated directly.
'''
r=requests.get(url,headers={'referer':my_referer})
r.encoding=r.apparent_encoding
html=r.content
soup=BeautifulSoup(html,"html.parser")
s=soup.select("div p a")[0].img["src"]
'''
# Getting the images from the link:
#s=soup.select(".article-content")
#type(s[0])
#Out[18]: bs4.element.Tag
#t=s[0].get_text()
#f=open("d:/测试解析文档学习.html","w",encoding="utf-8")  # scratch file for parsing practice
#f.write(str(s))

# Sample image-URL prefixes observed on the site:
'''
a='https://i.meizitu.net/2019/01/13d'
b="https://i.meizitu.net/2018/12/29d"
c="https://i.meizitu.net/2017/01/01b"
d="https://i.meizitu.net/2017/01/02b"
ls=[a,b,c,d]
'''
"""
p1=["0"+str(i) for i in range(1,10)]  # quick list comprehension
p1.extend(["10","11","12"])           # (list.append takes one argument; extend is needed here)
"""
p = "https://i.meizitu.net/2018/"  # year 2018
p0 = ["0"+str(i) for i in range(1, 10)] + [str(i) for i in range(10, 13)]  # months 01-12
p1 = ["0"+str(i) for i in range(1, 10)] + [str(i) for i in range(10, 32)]  # days 01-31

#p2=[chr(i) for i in range(97,123)]  # generates a-z (found by searching Baidu)
# Analysis shows most images look like https://i.meizitu.net/2018/12/28a01.jpg,
# and the trailing letter is almost always a, b or c -- so for efficiency,
# only try those and save some time.
p2 = ["a", "b", "c"]

url_day = []
for k in p0:
    for i in p1:
        url_day.append(p + k + '/' + i)  # one URL prefix per day of the year

# Sweeping the whole year like this is not very practical;
# honestly crawling a single month works better.
"""
p12="https://i.meizitu.net/2018/10/"
url_Nov=[]
for k in p1:
    for i in p2:
        url_Nov.append(p12+k+i)
"""

def download(url):
    for j in p2:                      # try each trailing letter
        for i in range(1, 60):        # image numbers 01-59
            link = url + j + "{:02d}".format(i) + ".jpg"
            try:
                # note: 0.1 s is an aggressive timeout; a slow response is
                # treated the same as a missing image
                r1 = requests.get(link, timeout=0.1, headers={'referer': my_referer})
                r1.raise_for_status()  # a 404 raises HTTPError and ends this series
            except requests.RequestException:
                print("not found:", "{:^10}".format(link))
                break
            # e.g. ".../2018/12/28a01.jpg" -> "20181228a01.jpg"
            name = link[-17:-4].replace("/", "") + ".jpg"
            with open("f:/爬虫生成文件/图片a-c/" + name, "wb") as f:
                f.write(r1.content)

def main():
    for day_url in url_day:
        download(day_url)

if __name__ == "__main__":
    main()

"""
def main():
    for i in url_Nov[:200]:
        download(i)
"""

"""
Testing against a missing file:
x
Out[107]: 'https://i.meizitu.net/2018/12/12o1'
r1=requests.get(x,headers={'referer':my_referer})
r1
Out[109]: <Response [404]>
Now I understand why the try/except and raise_for_status are needed here.
url[-11:].replace("/","")
"""
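The closing note above records the key discovery: requests.get on a dead link returns normally with a 404 Response instead of raising, so without raise_for_status the script would happily save the error page bytes as a .jpg. A minimal demonstration of that behavior, using the missing URL from the note (the printed error text is requests' usual wording, not captured from a real run):

import requests

# the missing URL from the closing note; requests returns a Response, it does not raise
x = "https://i.meizitu.net/2018/12/12o1"
r = requests.get(x, headers={"referer": "https://www.mzitu.com/169451"})
print(r.status_code)         # 404

try:
    r.raise_for_status()     # converts the 404 into an exception we can catch
except requests.HTTPError as err:
    print("caught:", err)    # e.g. "404 Client Error: Not Found for url: ..."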
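Beyond correctness, the probe loop fires thousands of requests, each opening a fresh connection. Below is a sketch of the same per-day series probe using a shared requests.Session for connection reuse and os.makedirs so the output directory does not have to exist beforehand; probe_series, max_images, and the 3-second timeout are illustrative assumptions, not part of the original script.

import os
import requests

MY_REFERER = "https://www.mzitu.com/169451"
OUT_DIR = "f:/爬虫生成文件/图片a-c"   # same output directory as the script above

def probe_series(session, day_url, letter, out_dir=OUT_DIR, max_images=60):
    """Download letterNN.jpg images until the first miss, reusing one connection."""
    os.makedirs(out_dir, exist_ok=True)           # create the directory if needed
    for i in range(1, max_images):
        link = "{}{}{:02d}.jpg".format(day_url, letter, i)
        try:
            r = session.get(link, timeout=3, headers={"referer": MY_REFERER})
            r.raise_for_status()                  # 404 -> HTTPError -> series is over
        except requests.RequestException:
            print("not found:", link)
            break
        name = link[-17:-4].replace("/", "") + ".jpg"   # e.g. 20181228a01.jpg
        with open(os.path.join(out_dir, name), "wb") as f:
            f.write(r.content)

# usage: one Session shared across every probe
# with requests.Session() as s:
#     for day_url in url_day:
#         for letter in ("a", "b", "c"):
#             probe_series(s, day_url, letter)

A Session keeps the underlying TCP (and TLS) connection alive between requests to the same host, which matters when most of the work here is small requests to one image server.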