  • Backup

    # -*- coding: utf-8 -*-
    """
    Created on Mon Jan 14 18:23:10 2019
    
    @author: Administrator
    """
    
    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    #import xlwt
    
    
    
    
    
    '''
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
                    
    sheet = book.add_sheet('test', cell_overwrite_ok=True)
                    
    sheet.write(0,0,link[-7:-4])
                    
    book.save(r'e:\test1.xls')
    '''
    
    #url="https://www.mzitu.com/169451"
    
    my_referer = r'https://www.mzitu.com/169451'
    
    
    
    # For getting an image link; no longer needed, because the links can be generated directly
    '''
    r=requests.get(url,headers={'referer':my_referer})
    r.encoding=r.apparent_encoding
    html=r.content
    soup=BeautifulSoup(html,"html.parser")
    s=soup.select("div p a")[0].img["src"]
    '''
    
    # Download the images from the generated links
    
    
    
    
    #s=soup.select(".article-content")
    
    #type(s[0])
    #Out[18]: bs4.element.Tag
    
    #t=s[0].get_text()
    
    
    #f=open("d:/测试解析文档学习.html","w",encoding="utf-8")
    #f.write(str(s))
    '''
    a='https://i.meizitu.net/2019/01/13d'
    
    b="https://i.meizitu.net/2018/12/29d"
    
    c="https://i.meizitu.net/2017/01/01b"
    d="https://i.meizitu.net/2017/01/02b"
    ls=[]
    ls=[a,b,c,d]
    '''
    
    
    
    
    
    """
    p1=["0"+str(i) for i in range(1,10)]    #快速列表生成器
    p1.append("10","11","12")  
    """
    
    
     
    site="https://i.meizitu.net/"     #2018
    
    year=[site+str(i)+"/" for i in range(2015,2020)]    # generate the year prefixes 2015-2019
    
    
    # (days 01-31 are generated inside nyr() below)
        
        
    #p2=[chr(i) for i in range(97,123)]          # generate a-z (chr 97-122)
    
    #        for j in p2:
    
    # Analysis shows most image URLs look like https://i.meizitu.net/2018/12/28a01.jpg; the trailing letter is usually a, b or c, so limiting the letters would save time
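    
    # A minimal sketch of the naming pattern described above (illustrative only;
    # build_link is a hypothetical helper and is not used by the crawler below):
    def build_link(prefix, letter, index):
        # e.g. build_link("https://i.meizitu.net/2018/12/28", "a", 1)
        #      -> "https://i.meizitu.net/2018/12/28a01.jpg"
        return prefix + letter + "{:02d}".format(index) + ".jpg"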
    
    
    
    
    def nyr(y):
        p0=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,13)]   # months 01-12
        p1=["0"+str(i) for i in range(1,10)]+[str(i) for i in range(10,32)]   # days 01-31
        
        url_day=[]
        for k in p0:
            for i in p1:
                url_day.append(y+k+'/'+i)     # build the year/month/day prefix
        
        return url_day
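    
    # Quick sanity check of what nyr() produces (assumption: safe to run inline,
    # since it only builds strings; _sample is introduced here for illustration):
    _sample = nyr("https://i.meizitu.net/2015/")
    assert _sample[0] == "https://i.meizitu.net/2015/01/01"
    assert _sample[-1] == "https://i.meizitu.net/2015/12/31"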
    # Walking the whole year like this is not very practical; crawling a single month at a time works better (a cleaner single-month sketch follows the quoted block below)
    """
    p12="https://i.meizitu.net/2018/10/"
    url_Nov=[]
    for k in p1:    
        for i in p2: 
             url_Nov.append(p12+k+i)
    
    
    """
    
    header = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Referer": r'https://www.mzitu.com/169451'
    }
    
    
    
    #headers={'referer':my_referer}
    
    
    def downloud(url):
        #p2=["a","b","c"]
        p2=[chr(i) for i in range(97,123)]          # letters a-z
        for j in p2:
            for i in range(1,60):
                if i<10:
                    link=url+j+"0"+str(i)+".jpg"
                else:
                    link=url+j+str(i)+".jpg"
                
                try:
                    r1=requests.get(link,timeout=0.1,headers=header)
                    r1.raise_for_status()           # turn 404 responses into exceptions
                    html1=r1.content
                    # link[-17:-4] is e.g. "2018/12/28a01"; drop the slashes for the filename
                    with open("f:/爬虫生成文件/2015-/"+link[-17:-4].replace("/","")+".jpg","wb") as f:
                        f.write(html1)
                except Exception:
                    # record the letter+index that failed, then skip to the next letter
                    with open("f:/爬虫生成文件/爬虫字母统计.txt","a",encoding="utf-8") as k:
                        k.write(link[-7:-4]+",")
                    print("Not found:","{:^10}".format(link))
                    break
                
    
    def main():
        start_time=time.time()
        for y in year:
            for prefix in nyr(y):
                downloud(prefix)
        end_time=time.time()
        print("{:10}".format(end_time-start_time))   # elapsed seconds
    
    
    main()
    
    
    
    
    
    
    
    """
    
    
    
    def main():
        for i in url_Nov[:200]:
            downloud(i)    
            
    """
    
    
    """
    
    Testing against a missing file:
    x
    Out[107]: 'https://i.meizitu.net/2018/12/12o1'
    
    r1=requests.get(x,headers={'referer':my_referer})
    
    r1
    Out[109]: <Response [404]>
    
    
    Now I understand why try/except and raise_for_status() are needed here.
    
    
    
    url[-11:].replace("/","")
    """         
           
    
            

  • Original post: https://www.cnblogs.com/xinqidian/p/10272389.html