  • Python crawler: scraping the mzitu.com image gallery site

    The full script is below: it loops over gallery post IDs, reads each post's total page count, extracts the image URL on every page with an XPath query, and saves the images to disk.

    #coding=utf-8
    import os
    import re
    from time import sleep
    
    import requests
    from lxml import etree
    
    
    host = "http://www.mzitu.com"

    # Shared request headers; the Referer matches the site to avoid hotlink blocking.
    HEADERS = {
        "Referer": "http://www.mzitu.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/63.0.3239.84 Safari/537.36",
    }

    category = ['xinggan']

    # Range of gallery post IDs to crawl (end_page is exclusive).
    start_page = 124973
    end_page = start_page + 1

    
    def validateTitle(title):
        """Replace characters that are illegal in file names with underscores."""
        rstr = r'[/\\:*?"<>|]'  # /  \  :  *  ?  "  <  >  |
        new_title = re.sub(rstr, "_", title)
        return new_title


    def save_img(img, dir_path, file_name):
        """Download one image and write it to dir_path + file_name."""
        file_content = requests.get(img, headers=HEADERS)
        if file_content.status_code != 200:
            print(img, "download failed")
        else:
            with open(dir_path + file_name, "wb") as f:
                f.write(file_content.content)
            print("saved image " + dir_path + file_name)


    def get_html(url, page):
        """Fetch one page of a gallery post, extract the image URL via XPath, and save the image."""
        sleep(5)  # throttle requests so the crawler is not blocked
        new_url = url + "/" + str(page)
        response = requests.get(new_url, headers=HEADERS)
        print(response.headers)
        html = etree.HTML(response.content)
        title = html.xpath("/html/body/div[2]/div[1]/h2/text()")
        img_url = html.xpath("/html/body/div[2]/div[1]/div[3]/p/a/img/@src")
        if len(title) > 0 and len(img_url) > 0:
            title = validateTitle(title[0])
            suffix = os.path.splitext(img_url[0])[1]  # keep the original file extension
            title = title + suffix

            dir_path = "/www/spider/images/"
            print(dir_path + title)
            print(img_url)
            save_img(img_url[0], dir_path, title)


    try:
        for post_id in range(start_page, end_page):
            url = host + '/' + str(post_id)
            response = requests.get(url, headers=HEADERS)
            print(url)
            print(response.headers)
            if response.status_code == 200:
                html = etree.HTML(response.content)
                # Read the post's total page count from the pagination block.
                total_page = html.xpath("/html/body/div[2]/div[1]/div[4]/a[5]/span/text()")
                if len(total_page) > 0:
                    for page in range(1, int(total_page[0]) + 1):
                        get_html(url, page)

    except Exception as e:
        print(str(e))
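
    The script assumes the output directory /www/spider/images/ already exists and gives up on the first failed request. A minimal sketch of a more defensive download helper (the name download_image, the retry count, and the timeout are illustrative additions, not part of the original script) could look like this:

    import os
    import requests

    def download_image(img_url, dir_path, file_name, headers, retries=3):
        """Illustrative helper: create the target directory if needed and retry failed downloads."""
        os.makedirs(dir_path, exist_ok=True)  # create dir_path (e.g. /www/spider/images/) if it is missing
        for _ in range(retries):
            try:
                resp = requests.get(img_url, headers=headers, timeout=10)
                if resp.status_code == 200:
                    with open(os.path.join(dir_path, file_name), "wb") as f:
                        f.write(resp.content)
                    return True
            except requests.RequestException:
                pass  # network error: fall through and retry
        return False

    Called in place of save_img (passing HEADERS explicitly), it keeps the rest of the script unchanged while surviving a missing directory or a transient network error.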
    
  • Original article: https://www.cnblogs.com/brady-wang/p/8922411.html