zoukankan      html  css  js  c++  java
  • 爬虫 爬取妹子图

    功能写的很差,简单练手

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    
    import hashlib
    import os
    import re
    import time

    import requests  # pip3 install requests
    
    # Directory that downloaded images are written to (read by get_movie).
    # NOTE(review): the path has no separators after "D:" -- backslashes may
    # have been lost when the snippet was published; confirm the intended folder.
    movie_path = r'D:爬虫学习爬虫妹子图'
    
    
    def get_index_page(url, timeout=10):
        """Fetch a listing page and return its HTML text.

        Args:
            url: Address of the index page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise None (non-200 status or a failed request).
        """
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and let the caller move on.
            print('%s 请求失败: %s' % (url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parse_index(index_page):
        """Yield the detail-page URLs found in a listing page.

        Every ``href`` that follows an ``li>`` is considered. A URL is yielded
        only when it contains a ``/`` and the text after its last ``/`` is
        non-empty, so links ending in ``/`` and slash-less hrefs such as ``#``
        are skipped.

        Args:
            index_page: HTML text of the listing page; None or an empty string
                produces no URLs.

        Yields:
            Each matching URL, in document order.
        """
        if not index_page:
            # A failed download hands us None; there is nothing to parse.
            return
        for detail_url in re.findall('li>.*?<a href="(.*?)"', index_page, re.S):
            # rpartition never raises, unlike indexing the result of rsplit,
            # which fails on an href that contains no "/" at all.
            _, sep, last_segment = detail_url.rpartition('/')
            if sep and last_segment:
                yield detail_url
    
    
    def get_parge_url(detail_url, timeout=10):
        """Fetch a detail page with browser-like headers and return its HTML.

        Args:
            detail_url: Address of the detail page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text when the server answers with status 200,
            otherwise None (non-200 status or a failed request).
        """
        headers = {
            "Referer": "www.mzitu.com",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(detail_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and let the caller move on.
            print('%s 请求失败: %s' % (detail_url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parse_detail(detail):
        """Return the ``src`` of the first ``<img>`` tag in a detail page.

        Only tags written as ``<img src="..." `` (closing quote followed by a
        space) are recognised.

        Args:
            detail: HTML text of the detail page. Anything that is not a string
                (for example None from a failed download) yields None.

        Returns:
            The first matching image URL, or None when there is no match.
        """
        if not isinstance(detail, str):
            return None
        # search() stops at the first hit instead of collecting every match.
        match = re.search('<img src="(.*?)" ', detail, re.S)
        if match is None:
            return None
        return match.group(1)
    
    
    def get_movie(url, page_url, timeout=10):
        """Download one image and save it as a ``.jpg`` under ``movie_path``.

        The file is named after the MD5 hex digest of the current time plus the
        image URL. Failures are reported with ``print`` and never raised.

        Args:
            url: Address of the image; a falsy value (e.g. None) is ignored.
            page_url: Address of the page the image was found on, sent as the
                Referer header.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            None.
        """
        if not url:
            # Callers may pass None when no image URL was found on the page.
            return

        headers = {
            "Referer": page_url,  # works around the site's hotlink protection
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print('%s 下载失败: %s' % (url, exc))
            return
        if response.status_code != 200:
            return

        digest = hashlib.md5()
        digest.update(str(time.time()).encode('utf-8'))
        digest.update(url.encode('utf-8'))
        # os.path.join picks the right separator and avoids the invalid "\%"
        # escape that the hand-built path relied on.
        filepath = os.path.join(movie_path, '%s.jpg' % digest.hexdigest())
        try:
            # Create the target directory on first use so open() cannot fail
            # merely because the folder is missing.
            os.makedirs(movie_path, exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('%s 保存失败: %s' % (url, exc))
            return
        print('%s 下载成功' % url)
    
    
    def main():
        """Crawl five listing pages and download the first image of each detail page.

        Pages that cannot be fetched, and detail pages on which no image URL is
        found, are skipped.
        """
        base_url = 'http://www.mzitu.com/xinggan/page/{0}/'
        # NOTE(review): page numbers start at 0 here; confirm the site serves
        # page 0 rather than starting at 1.
        for i in range(5):
            text = get_index_page(base_url.format(i))
            if not text:
                # get_index_page returns None on failure, and parse_index
                # cannot run its regex over None.
                continue
            for detail_url in parse_index(text):
                detail = parse_detail(get_parge_url(detail_url))
                if not detail:
                    # No image URL on this page: skip the pointless request.
                    continue
                get_movie(detail, detail_url)
       
    
    
    # Start the crawl only when the file is run as a script, not when imported.
    if __name__ == '__main__':
        main()

    结果:

    结果

  • 相关阅读:
    Linux 下复制命令行输出内容或直接复制文本内容
    JavaScript Array contrast
    Docker安装 Mysql 8.0 并挂载外部配置和数据
    IPC 方法分类
    Linux 安装各种常用通讯软件
    Docker--关于域名和端口配置问题总结
    Golang--Directional Channel(定向通道)
    数位dp
    STL:reverse函数、upper_bound函数、lower_bound函数
    vue filter中无法访问this的解决方案
  • 原文地址:https://www.cnblogs.com/supery007/p/8297599.html
Copyright © 2011-2022 走看看