zoukankan      html  css  js  c++  java
  • 爬取7160美女图片

    #coding=utf-8
    
    import urllib.request
    from bs4 import BeautifulSoup
    from urllib import error
    import re
    # Category path segments of the target site.
    # NOTE(review): not referenced by the visible code; the crawl loop
    # hard-codes the "xiaohua" category.
    ls = [
        "zhenrenxiu",
        "meinv",
        "lianglichemo",
        "rentiyishu",
        "xiaohua",
    ]
    def validateTitle(title):
        """Return *title* with characters that are illegal in file names replaced.

        Each of ``/ \\ : * ? " < > |`` is replaced by an underscore so the
        result can be used as a file name on Windows.

        Args:
            title: The raw title string.

        Returns:
            The sanitized title; unchanged if it contains none of the
            characters listed above.
        """
        # Single-quoted raw string so the double quote can sit inside the
        # character class; the previous literal was terminated early by that
        # quote and did not parse.
        rstr = r'[/\\:*?"<>|]'
        new_title = re.sub(rstr, "_", title)
        return new_title
    
    # Crawl album ids 1..59999 of the "xiaohua" category. For every page of an
    # album, save the first <img> found on the page under D://img2/, named after
    # the page's <h1> title.

    # Compiled once, outside the loops. The earlier pattern r'd+' matched the
    # letter "d" instead of a run of digits, so the page count never parsed.
    digits_pattern = re.compile(r'\d+')

    for j in range(1, 60000):
        url_origin = "http://www.7160.com/xiaohua/" + str(j)
        try:
            # Close the HTTP response as soon as the document is parsed.
            with urllib.request.urlopen(url_origin) as page_obj:
                page_soup = BeautifulSoup(page_obj, 'lxml')

            # NOTE(review): an empty pattern matches the first text node in the
            # document; the pattern that located the pager text appears to have
            # been lost -- confirm against the page markup.
            total_page_obj = page_soup.find(text=re.compile('')).string
            match = digits_pattern.search(total_page_obj)
            total_page = int(match.group()) if match is not None else 0

            # Inclusive upper bound: the previous range(1, total_page) never
            # requested the last page.
            for i in range(1, total_page + 1):
                if i == 1:
                    url = url_origin + "/index.html"
                else:
                    url = url_origin + "/index_" + str(i) + ".html"
                request = urllib.request.Request(url)
                try:
                    with urllib.request.urlopen(request) as res:
                        soup = BeautifulSoup(res, 'lxml')

                    title_obj = soup.find(attrs={"class": "picmainer"})
                    if title_obj is None:
                        # Page has no picture container; nothing to save.
                        continue

                    print(url)
                    title = title_obj.h1.string
                    # First <img> anywhere in the document.
                    content = soup.find('img')
                    src = content.get("src")

                    file_name = validateTitle(title) + ".jpg"
                    urllib.request.urlretrieve(src, "D://img2/" + file_name)
                    print(file_name + "保存成功")
                except Exception as e:
                    # Best effort: report the failure and move on to the next page.
                    print("异常" + str(j), e)
        except Exception as e:
            # Best effort: report the failure and move on to the next album id.
            print("异常" + str(j), e)
  • 相关阅读:
    反射之初认识
    面向对象(上)练习一 改进:调用方法
    关于php中id设置自增后不连续的问题
    由于定界符引出的格式错误问题
    PHP 关于timezone问题
    2016.4.29 园子第一天,希望所有的坚持都有所收获
    递归调用
    动手动脑
    界面实验任务
    课程作业02
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8321709.html
Copyright © 2011-2022 走看看