zoukankan      html  css  js  c++  java
  • 爬取妹子图

    本文转自 https://blog.csdn.net/baidu_35085676/article/details/68958267 

    文中的代码我自己跑了一遍,主要的解析方式用的是 BeautifulSoup。但是代码跑起来可能会出现一些问题,例如 TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。个人觉得这应该是网站反爬虫机制造成的,可以尝试变换 IP 地址。

      1 import requests
      2 from bs4 import BeautifulSoup
      3 import os
      4 import time
      # Root URL of the site; find_max_page() reads the pagination from it.
      all_url = 'http://www.mzitu.com'

      # HTTP request headers used when fetching HTML pages.
      Hostreferer = {
          'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
          'Referer': 'http://www.mzitu.com',
      }

      # Request headers used for the image downloads; this Referer is what gets
      # the request past the image host's hotlink protection.
      Picreferer = {
          'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
          'Referer': 'http://i.meizitu.net',
      }

      # Directory under which one sub-directory per album is created.
      path = 'E:/pythonFile/meititu/mei/'

      # Progress record file ("page|line"), kept inside the download directory.
      # Derived from `path` so the two locations cannot drift apart.
      data = path + '.data'
     def get_log(file):
         """Read the resume position saved by a previous run.

         The record file is expected to hold one line of the form ``page|line``
         (the format written by ``put_log``).

         Args:
             file: Path of the record file.

         Returns:
             A ``(page, line)`` tuple of ints.  ``(1, 0)`` is returned, after
             printing the reason, when the file cannot be read, is malformed, or
             holds a position that cannot be valid (page < 1 or line < 0).
         """
         try:
             with open(file, 'r', encoding='utf-8') as f:
                 record = f.readline()
             # A wrong number of fields or a non-numeric field raises ValueError.
             page, line = (int(field) for field in record.split('|'))
             if page < 1 or line < 0:
                 # A negative line would later be used as a negative list index.
                 raise ValueError('invalid record: {!r}'.format(record))
         except (OSError, ValueError) as e:
             print(e)
             print('读取记录失败,从初始开始')
             return 1, 0
         return page, line
     def put_log(file, page, line):
         """Save the current position so that a later run can resume from it.

         The record is written as a single ``page|line`` line, replacing any
         previous content of the file.

         Args:
             file: Path of the record file.
             page: Listing page number being processed.
             line: Index of the entry on that page.

         Saving is best-effort: an I/O failure is printed and otherwise ignored,
         so a failed save never aborts the run.
         """
         try:
             with open(file, 'w', encoding='utf-8') as f:
                 f.write('{}|{}'.format(page, line))
         except OSError as e:
             print('保存记录失败:[{}]'.format(e))
     def find_max_page():
         """Return the number of the last listing page of the site.

         Fetches ``all_url`` and takes the largest numeric label among the
         ``<a class="page-numbers">`` pagination links.  Non-numeric links (such
         as a "next page" link) are ignored, so the result does not depend on
         where the last page number sits in the link list.

         Returns:
             The highest page number as an int, or 1 when the page carries no
             numeric pagination link.

         Raises:
             requests.RequestException: The request timed out, failed, or the
                 server answered with an HTTP error status.
         """
         # Without a timeout a stalled connection would block the script forever.
         start_html = requests.get(all_url, headers=Hostreferer, timeout=15)
         start_html.raise_for_status()
         soup = BeautifulSoup(start_html.text, "html.parser")
         labels = [link.text.strip()
                   for link in soup.find_all('a', class_='page-numbers')]
         return max((int(label) for label in labels if label.isdigit()), default=1)
     # Seconds to wait for a server response before a request is abandoned.
     REQUEST_TIMEOUT = 15
     # Attempts made for one URL / one picture page before giving up on it.
     MAX_RETRIES = 5
     # Pause in seconds between two attempts, so retries do not hammer the site.
     RETRY_DELAY = 2


     def _get(url, headers):
         """Fetch url with a timeout, retrying a few times on request errors.

         Returns the successful ``requests`` response.  When the last of
         MAX_RETRIES attempts fails, its ``requests.RequestException`` propagates
         to the caller.
         """
         for attempt in range(1, MAX_RETRIES + 1):
             try:
                 response = requests.get(url, headers=headers,
                                         timeout=REQUEST_TIMEOUT)
                 response.raise_for_status()
                 return response
             except requests.RequestException:
                 if attempt == MAX_RETRIES:
                     raise
                 time.sleep(RETRY_DELAY)


     def _find_picture(page_url, title):
         """Return the <img> tag on page_url whose alt equals title, or None.

         The page is fetched up to MAX_RETRIES times, with a pause in between,
         until it contains the image tag.
         """
         for attempt in range(MAX_RETRIES):
             if attempt:
                 time.sleep(RETRY_DELAY)
             mess = BeautifulSoup(_get(page_url, Hostreferer).text, "html.parser")
             pic_url = mess.find('img', alt=title)
             if pic_url:
                 return pic_url
         return None


     def _download_album(href, title):
         """Download every picture of the album at href into its own directory.

         Returns True when the album was processed, False when it was skipped
         (already complete on disk, or its page has no pagination block).
         """
         # Windows cannot create a directory whose name contains '?'.
         album_dir = path + title.strip().replace('?', '')
         existed = os.path.isdir(album_dir)
         os.makedirs(album_dir, exist_ok=True)

         mess = BeautifulSoup(_get(href, Hostreferer).text, "html.parser")
         pagenavi = mess.find("div", class_='pagenavi')
         if pagenavi is None:
             print('未找到分页信息,跳过')
             return False
         # The picture count is the largest number shown in the pagination
         # spans; taking the maximum avoids relying on a fixed span position,
         # which breaks on short albums or when the page layout changes.
         pic_max = max((int(span.text) for span in pagenavi.find_all('span')
                        if span.text.strip().isdigit()), default=1)

         if existed and len(os.listdir(album_dir)) >= pic_max:
             print('已经保存完毕,跳过')
             return False

         for num in range(1, pic_max + 1):
             pic_url = _find_picture(href + '/' + str(num), title)
             if pic_url is None:
                 print('未找到图片,跳过:{}/{}'.format(href, num))
                 continue
             src = pic_url['src']
             # The image host is requested with its own Referer header.
             image = _get(src, Picreferer)
             # Write to an explicit path instead of changing the working directory.
             with open(os.path.join(album_dir, src.split('/')[-1]), 'wb') as f:
                 f.write(image.content)
         return True


     if __name__ == "__main__":
         same_url = 'http://www.mzitu.com/page/'
         max_page = find_max_page()
         page, line = get_log(data)
         print('从{}页,{}行开始缓存'.format(page, line))
         for n in range(page, max_page + 1):
             soup = BeautifulSoup(_get(same_url + str(n), Hostreferer).text,
                                  "html.parser")
             postlist = soup.find('div', class_='postlist')
             if postlist is None:
                 print('第{}页没有找到列表,停止'.format(n))
                 break
             all_a = postlist.find_all('a', target='_blank')
             # Resume inside the first page at the recorded entry index.
             for lines in range(line, len(all_a)):
                 a = all_a[lines]
                 title = a.get_text()
                 if title == '':
                     # Anchors without text are skipped; only titled links
                     # are treated as albums.
                     continue
                 print("准备扒取:" + title)
                 if not _download_album(a['href'], title):
                     continue
                 # Recorded only after the album is done, so a failure before
                 # this point is retried on the next run.
                 put_log(data, n, lines)
                 time.sleep(0.5)
             print('', n, '页完成')
             # Every page after the resumed one starts from its first entry.
             line = 0
             time.sleep(10)
  • 相关阅读:
    Postgresql HStore 插件试用小结
    postgres-xl 安装与部署 【异常处理】ERROR: could not open file (null)/STDIN_***_0 for write, No such file or directory
    GPDB 5.x PSQL Quick Reference
    postgresql 数据库schema 复制
    hive 打印日志
    gp与 pg 查询进程
    jquery table 发送两次请求 解惑
    python 字符串拼接效率打脸帖
    postgresql 日期类型处理实践
    IBM Rational Rose软件下载以及全破解方法
  • 原文地址:https://www.cnblogs.com/tianqianlan/p/11332724.html
Copyright © 2011-2022 走看看