zoukankan      html  css  js  c++  java
  • 爬取www.mmjpg.com网站图片,你懂得哦!

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 import requests
     4 import random
     5 import subprocess
     6 import urllib.request
     7 from bs4 import BeautifulSoup
     8 import sys
     9 import threading
    10 from concurrent.futures.process import ProcessPoolExecutor
    11 sys.setrecursionlimit(1000000)
    12 
    13 class obj(object):
    14     """ """
    15     def __init__(self):
    16         self.number = 1
    17 
    18     def startpage(self,url,end):
    19         code = ''
    20         for i in range(5):
    21             a = chr(random.randint(97, 122))
    22             b = random.randint(1, 9)
    23             uuid = random.choice([a, b])
    24             code += str(uuid)
    25         try:
    26             response = requests.get(url)
    27             response.encoding = 'utf8'
    28             html = response.text
    29             soup = BeautifulSoup(html,'html.parser')
    30             tag = soup.find(name='div',id='content')
    31             nexturl = tag.find(name='a').attrs.get('href')
    32             image = tag.find(name='a').find(name='img')
    33             imageurl = image.attrs.get('src')
    34             headers = {
    35             'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    36             'Content-Type':'image/jpeg',
    37             'Host':'img.mmjpg.com',
    38             'Referer':nexturl,
    39             'If-None-Match':'59a96b74-%s'%code,
    40             }
    41             userinfo = image.attrs.get('alt')
    42             userinfomation = userinfo.split(' ')[0]
    43             userinfos = userinfo.replace(' ','')
    44             getnum = int(imageurl.split('/')[5].split('.')[0])
    45             #print (userinfo)
    46             rs = requests.get(imageurl,headers=headers)
    47             if getnum == 1:
    48                 self.mkdir(userinfomation)
    49                 self.number = 1
    50             self.getimage(rs.content,userinfomation,userinfos)
    51             print (imageurl,userinfomation)
    52             url = nexturl
    53             self.number += 1
    54             if nexturl.split('/')[-2] != str(end):
    55                 self.startpage(url,end)
    56         except Exception as e:
    57             print (e)
    58 
    59     def getimage(self,url,name,num):
    60         try:
    61             iminfo = "H:\temp\%s\%s.jpg"%(name,num)
    62             status,resp = subprocess.getstatusoutput('dir %s'%iminfo)
    63             f = open(iminfo,'wb')
    64             f.write(url)
    65             f.close()
    66         except Exception as e:
    67             print (e)
    68 
    69 
    70     def mkdir(self,dir):
    71         status,result = subprocess.getstatusoutput("dir H:\temp\%s"%(dir))
    72         if status !=0:
    73             subprocess.Popen("md H:\temp\%s"%(dir),shell=True)
    74 
    75 site = obj()
    76 
    77 def main():
    78     startpage = 1097
    79     for i in range(41):
    80         endpage = startpage - 27
    81         if startpage == 17:
    82             endpage =0
    83         url ='http://www.mmjpg.com/mm/%s'%startpage
    84         t = threading.Thread(target=site.startpage,args=(url,endpage))
    85         t.start()
    86         #print (startpage,endpage)
    87         startpage -= 27
    88 if __name__ == '__main__':
    89     main()
  • 相关阅读:
    理解java的接口和抽象类
    Yum 仓库配置
    Vsftp 服务配置
    SAMBA 服务配置
    DHCP 服务配置
    dd 命令的使用
    linux 账户控制
    CentOS 系统优化
    Page Cache与Page回写
    TCP拥塞控制
  • 原文地址:https://www.cnblogs.com/zl-py/p/7491865.html
Copyright © 2011-2022 走看看