zoukankan      html  css  js  c++  java
  • 第六篇

    环境:python3  pycharm

    模块:requests  bs4  urllib.request(urlretrieve)  os  time

    第一步:获取网页源代码

    import requests
    from bs4 import BeautifulSoup
    from urllib.request import urlretrieve
    import os
    import time
    def get_html(url):
        """Fetch *url* and return the response body decoded as gbk text.

        Returns None (after printing the exception) when the request fails
        or times out.
        """
        try:
            # A timeout keeps the script from hanging forever on a dead server.
            response = requests.get(url, timeout=10)
            response.encoding = 'gbk'
            return response.text
        except Exception as e:
            print(e)
    if __name__ == '__main__':
        # Smoke-test the fetch against the gallery index page.
        index_url = 'http://www.521609.com/meinvxiaohua/'
        html = get_html(index_url)

    第二步:下载美女图片

    def down_show(html, page):
        """Parse gallery page *html* and save every thumbnail image.

        Images are written to ./show/ as '第<page>页-<num>.jpg'.
        Any error is printed and swallowed (best-effort download).
        """
        try:
            soup = BeautifulSoup(html, 'lxml')
            all_img = soup.find("div", class_="index_img list_center").find_all('img')
            # Create the output directory ONCE, before the loop, instead of
            # re-checking os.path.exists on every iteration.
            os.makedirs('show', exist_ok=True)
            for num, img in enumerate(all_img, start=1):
                src = img.get('src')  # site-relative image path
                url_pic = 'http://www.521609.com' + src
                urlretrieve(url_pic, './show/' + '第%s页-%s.jpg' % (page, num))
        except Exception as e:
            print(e)

    第三步:可选下载多少页,代码所示下载5页

    def get_pages(page, first_list=121):
        """Download *page* consecutive gallery list pages.

        page       -- how many list pages to fetch
        first_list -- site index of the first list page (list%d.html);
                      defaults to 121, matching the original hard-coded value.
        """
        for i in range(first_list, first_list + page):
            url = 'http://www.521609.com/meinvxiaohua/list%d.html' % i
            html = get_html(url)
            # Human-readable page number starts at 1.
            down_show(html, i - first_list + 1)
            time.sleep(1)  # be polite to the server
        print("图片下载完毕")
    if __name__ == '__main__':
        get_pages(5)

    也可以采用多线程

    import requests
    from bs4 import BeautifulSoup
    import threading
    import time
    import os
    
    # Shared HTTP headers for all requests.  The Referer matches the gallery
    # index page (image hosts commonly reject hot-linking without it).
    # NOTE(review): User-Agent is left empty -- fill in a real browser UA
    # string if the site starts blocking requests.
    headers = {
        'Referer': 'http://www.521609.com/meinvxiaohua/',
        'User-Agent': '',
    }
    
    def get_html(url):
        """Fetch *url* with the shared headers; return gb2312-decoded text.

        On any request error the exception is printed and None is returned.
        """
        try:
            resp = requests.get(url=url, headers=headers)
            resp.encoding = "gb2312"
            return resp.text  # decoded string body
        except Exception as err:
            print(err)
    
    def mk_dir():
        """Ensure the ./show/ output directory exists (idempotent)."""
        target = './show/'
        os.makedirs(target, exist_ok=True)
    
    def down_image(html, page):
        """Parse *html* and save each gallery thumbnail under ./show/.

        page -- label used in the output file names ('第<page>页-<num>.jpg').
        Errors are printed and abort the remaining images of this page
        (best-effort, matching the original behaviour).
        """
        try:
            soup = BeautifulSoup(html, 'lxml')  # parses HTML/XML
            all_img = soup.find('div', class_='index_img list_center').find_all('img')
            # enumerate replaces the manual num counter.
            for num, img in enumerate(all_img, start=1):
                src = img.get('src')  # site-relative path of the image
                url = 'http://www.521609.com' + src
                content = requests.get(url=url, headers=headers).content  # raw bytes
                with open('./show/第%s页-%s.jpg' % (page, num), 'wb') as file:
                    file.write(content)
                time.sleep(1)  # throttle: one image request per second
        except Exception as e:
            # Removed the dead `pass` that followed print(e).
            print(e)
    
    def get_pages(page):
        """Download the first *page* gallery list pages sequentially.

        List pages on the site start at list121.html, so the human-readable
        page number is i - 120.
        """
        for i in range(121, 121 + page):
            url = "http://www.521609.com/meinvxiaohua/list%s.html" % i
            html = get_html(url)
            if not os.path.exists('show'):
                mk_dir()
            # BUG FIX: pass the CURRENT page number (i - 120), not the total
            # *page* count -- the original reused the same file names on
            # every iteration, overwriting the previous page's images.
            down_image(html, i - 120)
            time.sleep(1)
            print('美女图片前%s页下载完毕' % str(i - 120))
        # Threaded alternative (note it already passed the correct page number):
        # if not os.path.exists('show'):
        #     mk_dir()
        # thread = []
        # for i in range(121,121+page):
        #     url = "http://www.521609.com/meinvxiaohua/list%s.html" % i
        #     html = get_html(url)
        #     t = threading.Thread(target=down_image,args=(html,str(i-120)))
        #     thread.append(t)
        # for i in thread:
        #     i.start()
        # for j in thread:
        #     j.join()
    
    
    def main():
        """Entry point: download 3 pages and report the elapsed time."""
        started = time.time()
        get_pages(3)
        elapsed = time.time() - started
        print(elapsed)  # e.g. 48.115086793899536
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    Javascript 获得数组中相同或不同的数组元素
    JS 获取(期号、当前日期、本周第一天、最后一天及当前月第一、最后天函数)
    Intellij IDEA2020.2.3最新激活码激活破解方法(2020.11.26)
    【jQuery 区别】.click()和$(document).on("click","指定的元素",function(){});的区别
    pytorch repeat 和 expand 函数的使用场景,区别
    python小技巧
    提高GPU利用率
    pyinstaller 打包文件(包括使用管理员模式)
    frp 开机自启动
    AUC指标深度理解
  • 原文地址:https://www.cnblogs.com/smart-zihan/p/9498984.html
Copyright © 2011-2022 走看看