zoukankan      html  css  js  c++  java
  • 爬虫 爬取妹子图

    功能写得很差,仅作简单练手

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import hashlib
    import os
    import re
    import time
    
    import requests  # pip3 install requests
    
    # Directory that get_movie() writes downloaded images into.
    # NOTE(review): the literal has no path separators after "D:" -- backslashes
    # may have been lost when this snippet was published; confirm the intended
    # folder before running.
    movie_path = r'D:爬虫学习爬虫妹子图'
    
    
    def get_index_page(url, timeout=10):
        """Download a listing page and return its HTML.

        Args:
            url: Absolute URL of the index page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise ``None`` (non-200 status or a network-level failure).
        """
        try:
            # Without a timeout a stalled server would block the crawl forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure instead of hiding it.
            print('%s 请求失败: %s' % (url, exc))
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    
    def parse_index(index_page):
        """Yield the detail-page links found in an index page's HTML.

        Every ``href`` of an ``<a>`` that follows a ``li>`` is a candidate. A
        candidate is yielded only when the text after its last ``/`` is
        non-empty, so links ending in ``/`` and links containing no ``/`` at
        all are skipped.

        Args:
            index_page: HTML text of the listing page, or ``None``/empty when
                the download failed.

        Yields:
            Each accepted link, exactly as it appears in the HTML.
        """
        if not index_page:
            # A failed download hands us None; there is nothing to parse.
            return
        for detail_url in re.findall(r'li>.*?<a href="(.*?)"', index_page, re.S):
            # rpartition never raises, unlike rsplit(...)[1] on a slash-less href.
            _, sep, tail = detail_url.rpartition('/')
            if sep and tail:
                yield detail_url
    
    
    def get_parge_url(detail_url, timeout=10):
        """Download a detail page and return its HTML.

        The request carries a fixed ``Referer`` and a desktop-browser
        ``User-Agent`` header.

        Args:
            detail_url: Absolute URL of the detail page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise ``None`` (non-200 status or a network-level failure).
        """
        headers = {
            "Referer": "www.mzitu.com",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        try:
            # Without a timeout a stalled server would block the crawl forever.
            response = requests.get(detail_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure instead of hiding it.
            print('%s 请求失败: %s' % (detail_url, exc))
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    
    def parse_detail(detail):
        """Return the ``src`` of the first ``<img src="..." `` tag in *detail*.

        Only tags where a space follows the closing quote of ``src`` match.
        Returns ``None`` when *detail* is not a string or no tag matches.
        """
        if not isinstance(detail, str):
            return None
        first_img = re.search('<img src="(.*?)" ', detail, re.S)
        if first_img is None:
            return None
        return first_img.group(1)
    
    
    def get_movie(url, page_url):
        """Download one image and save it as a ``.jpg`` under ``movie_path``.

        The file name is the MD5 hex digest of the current time plus the image
        URL, so repeated downloads of the same URL get distinct names.

        Args:
            url: Absolute URL of the image; a falsy value is ignored.
            page_url: URL of the page the image was found on, sent as the
                ``Referer`` header.

        Returns:
            None. Failures are reported on stdout and do not raise.
        """
        if not url:
            # parse_detail() returns None when a page has no matching <img>.
            return
        try:
            response = requests.get(
                url,
                headers={
                    "Referer": page_url,  # works around hotlink protection
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                },
                timeout=10,  # never hang forever on a stalled server
            )
        except requests.RequestException as exc:
            print('%s 下载失败: %s' % (url, exc))
            return
        if response.status_code != 200:
            return
    
        m = hashlib.md5()
        m.update(str(time.time()).encode('utf-8'))
        m.update(url.encode('utf-8'))
        # os.path.join replaces the hand-written '%s\%s.jpg', which relied on an
        # invalid escape sequence and only worked with Windows separators.
        filepath = os.path.join(movie_path, '%s.jpg' % m.hexdigest())
        try:
            # Create the target folder on first use instead of failing silently.
            os.makedirs(movie_path, exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('%s 保存失败: %s' % (url, exc))
            return
        print('%s 下载成功' % url)
    
    
    def main():
        """Crawl the listing pages and download the first image of each detail page.

        For every listing page the detail links are extracted, each detail page
        is fetched, its first image URL is located and the image is saved. Any
        step that produces nothing is skipped so one failure does not stop the
        whole crawl.
        """
        base_url = 'http://www.mzitu.com/xinggan/page/{0}/'
        # NOTE(review): range(5) requests page/0/ through page/4/ -- confirm
        # whether the site's pagination starts at 0 or 1.
        for i in range(5):
            text = get_index_page(base_url.format(i))
            if not text:
                # Download failed; parse_index cannot work on None.
                continue
            for detail_url in parse_index(text):
                detail_text = get_parge_url(detail_url)
                if not detail_text:
                    continue
                detail = parse_detail(detail_text)
                if not detail:
                    # No image found on this page, nothing to download.
                    continue
                get_movie(detail, detail_url)
       
    
    
    # Start the crawl only when the file is executed as a script, not on import.
    if __name__ == '__main__':
        main()

    结果:

    结果

  • 相关阅读:
    ActiveReport9 在MVC4项目中出错
    EntityFramework5.0 DataBase-First 在三层架构中的使用,分离实体类到Model层。
    SqlServer存在并删除 表,函数,view等
    Visual Studio常用技巧与插件
    让 WPF 应用程序单例化
    C# 常用加密方法一 AES 与 DES
    Windows 的公共文件夹
    Hibernate中Criteria的完整用法
    maven依赖关系中Scope的作用
    Eclipse取消设置项目默认空间
  • 原文地址:https://www.cnblogs.com/supery007/p/8297599.html
Copyright © 2011-2022 走看看