zoukankan      html  css  js  c++  java
  • 如何优雅地爬妹子网,手把手教你

     

    直接上代码,哈哈!!
    from urllib import request
    import os
    from user_agents import ua_list
    import time
    import random
    import re
    import requests
    from lxml import etree
    
    class MeiziSpider():
        """Crawler that downloads the images of every album linked from an archive page.

        Workflow: ``parse_html`` fetches the archive listing and extracts the
        album links, ``write_html`` walks those links with a random delay, and
        ``save_image`` pages through one album and writes each image to disk.

        Relies on module-level names imported by the file: ``request``
        (urllib), ``requests``, ``etree`` (lxml), ``ua_list``, ``os``, ``re``,
        ``random`` and ``time``.
        """

        # Regex for the main image of an album page.
        # Group 1 is the image URL, group 2 is the image's alt text.
        IMG_PATTERN = (
            ' <div class="main-image"><p><a href="https://www.mzitu.com/.*?" ><img '
            'src="(.*?)" alt="(.*?)" width=".*?" height=".*?" /></a></p>'
        )
        # Directory under which one sub-directory per album is created.
        DEFAULT_SAVE_DIR = '/home/ubuntu/meizi/'
        # Seconds to wait for any single HTTP request before giving up.
        TIMEOUT = 10

        def __init__(self, save_dir=DEFAULT_SAVE_DIR):
            """Create a spider.

            Args:
                save_dir: Root directory for downloaded albums. Defaults to
                    the location the script originally hard-coded.
            """
            self.url = 'https://www.mzitu.com/all/'
            self.save_dir = save_dir

        def get_html(self, url):
            """Fetch ``url`` with a random User-Agent and return the raw body.

            Args:
                url: Address of the page to download.

            Returns:
                The response body as ``bytes``.
            """
            headers = {'User-Agent': random.choice(ua_list)}
            req = request.Request(url=url, headers=headers)
            # The context manager closes the connection even if read() fails.
            with request.urlopen(req, timeout=self.TIMEOUT) as res:
                return res.read()

        def re_func(self, re_bds, html):
            """Return every match of the regex ``re_bds`` in ``html``.

            The pattern is compiled with ``re.S`` so ``.`` also matches
            newlines.

            Args:
                re_bds: Regular expression source string.
                html: Text to search.

            Returns:
                The list produced by ``findall``: strings for a pattern with
                zero or one group, tuples for a pattern with several groups.
            """
            pattern = re.compile(re_bds, re.S)
            return pattern.findall(html)

        def parse_html(self, url):
            """Parse the first-level (archive) page and download every album on it.

            Args:
                url: Address of the archive listing page.
            """
            html = self.get_html(url).decode()
            parse_obj = etree.HTML(html)
            href_list = parse_obj.xpath('//div[@class="all"]/ul[@class="archives"]/li/p[@class="url"]/a/@href')
            print("href_list:", href_list)
            self.write_html(href_list)

        def write_html(self, href_list):
            """Download each album in ``href_list``, pausing 1-3 seconds between albums.

            Args:
                href_list: Iterable of album page URLs.
            """
            for two_url in href_list:
                print(two_url)
                time.sleep(random.randint(1, 3))
                self.save_image(two_url)

        @staticmethod
        def _safe_name(name):
            """Turn scraped text into a single, safe path component."""
            cleaned = name.replace('/', '_').replace('\\', '_')
            # Empty or dot-only names would resolve to the root or a parent directory.
            return '_' if cleaned in ('', '.', '..') else cleaned

        def save_image(self, two_url):
            """Download the images of one album, page by page, until a page has none.

            Pages are requested as ``two_url/0``, ``two_url/1``, ... The loop
            stops at the first page that contains no main image, or at the
            first network or disk error (which is reported, not raised).

            Args:
                two_url: Address of the album (second-level) page; also sent
                    as the ``Referer`` header.
            """
            headers = {'Referer': two_url, 'User-Agent': random.choice(ua_list)}
            print('---------two_url-----------', two_url)
            i = 0
            while True:
                img_link = two_url + '/{}'.format(i)
                print("img_link:", img_link)
                try:
                    html = requests.get(url=img_link, headers=headers, timeout=self.TIMEOUT).text
                except requests.RequestException as e:
                    print("request failed:", img_link, e)
                    break

                img_html_list = self.re_func(self.IMG_PATTERN, html)
                print("img_html_list", img_html_list)
                if not img_html_list:
                    # No main image on this page: the album is exhausted.
                    break

                img_src, name = img_html_list[0]
                print("-----name:", name)
                folder = self._safe_name(name)
                # The trailing '' keeps the directory path ending in a separator.
                direc = os.path.join(self.save_dir, folder, '')
                print("direc:", direc)
                filename = direc + folder + str(i) + '.jpg'
                try:
                    os.makedirs(direc, exist_ok=True)
                    # Request the image link itself; .content is the raw bytes.
                    img_ = requests.get(url=img_src, headers=headers, timeout=self.TIMEOUT).content
                    with open(filename, 'wb') as f:
                        f.write(img_)
                except (requests.RequestException, OSError) as e:
                    print("download failed:", img_src, e)
                    break
                i += 1
    
    
    if __name__ == '__main__':
        # Script entry point: crawl the archive listing and download every album it links to.
        MeiziSpider().parse_html('https://www.mzitu.com/all')

    大功告成。都看懂了吧?如果有不懂的,可以去小编的 Python 交流群:一久武其而而流一思(数字的谐音),转换一下就可以找到了,里面有最新的 Python 教程和项目!一起交流进步吧。

  • 相关阅读:
    【分享】项目开发容易出现的问题?身为前端/后端你见到过吗?
    标准化API设计的重要性
    【分享】对外API接口安全设计
    【实例】调用数据库自动生成接口代码
    【翻译】API-First是什么概念?有什么商业价值?
    保障接口安全的5种常见方式
    【翻译】使用OpenAPI规范进行安全的API设计
    为什么需要API文档
    利用java的反射,实现工厂创建对象
    Cesium入门8
  • 原文地址:https://www.cnblogs.com/chengxuyuanaa/p/12002605.html
Copyright © 2011-2022 走看看