zoukankan html css js c++ java

【Python】爬取百度图片

所需要导入的包：
BeautifulSoup:
    该模块用于接收一个HTML或XML字符串，然后将其进行格式化，之后便可以使用它提供的方法快速查找指定元素，
    从而使得在HTML或XML中查找指定元素变得简单。
    这里用它来获取页面中的各个标签及其内容，我们主要用到其中的find()和find_all()函数
requests:
    用来获取网页信息。也就是说，我们给它一个url，它能把这个url对应的页面信息全部返回给我们，这时候我们再用BeautifulSoup里的函数对这些信息进行处理
lxml :
    是一个解析器。Python自带解析器html.parser，但lxml的解析速度优于html.parser，所以建议使用lxml
re :
    正则表达式函数
os :
    系统函数

find()函数如下：

find(name,attrs,recursive,text,**kwargs)

这些参数相当于过滤器一样可以进行筛选处理。

不同的参数过滤可以应用到以下情况：

    查找标签，基于name参数
    查找文本，基于text参数
    基于正则表达式的查找
    查找标签的属性，基于attrs参数
    基于函数的查找

通过标签查找

我们可以传递任何标签的名字来查找到它第一次出现的地方。找到后,find函数返回一个BeautifulSoup的标签对象。

  1 # -*- coding:utf-8
  2 import re
  3 import requests
  4 from urllib import error
  5 from bs4 import BeautifulSoup
  6 import os
  7 
  8 num = 0  # images saved so far; incremented by dowmloadPicture and used in output file names
  9 numPicture = 0  # download limit; set from user input in the main section
 10 file = ''  # output folder name; set from user input in the main section
 11 List = []  # Find() appends one list of matched image URLs per result page
 12 
 13 #检测图片数量函数
 14 def Find(url):
 15     global List  #设置为全局变量
 16     print('正在检测图片总数，请稍等.....')
 17     t = 0  #objURL 分页数初始值
 18     i = 1
 19     s = 0
 20     while t < 1000:
 21         Url = url + str(t) # url地址加上分页数
 22         try:
 23             Result = requests.get(Url, timeout=7) #获取到url.timeout时间为7秒.如果获取不到7秒后退出
 24         except BaseException:
 25             t = t + 60 #源代码分页数为60
 26             continue
 27         else:
 28             result = Result.text #以encoding解析返回内容。字符串方式的响应体，会自动根据响应头部的字符编码进行解码。
 29             pic_url = re.findall('"objURL":"(.*?)",', result, re.S)  # 先利用正则表达式找到图片url
 30             s += len(pic_url) #根据正则表达式循环取出图片.(根据图片的数量长度来取,其实就是统计图片的个数)
 31             if len(pic_url) == 0: #长度为0说明没有符合条件的图片了退出
 32                 break
 33             else:
 34                 List.append(pic_url) #将取出的图片存入到list中去
 35                 t = t + 60
 36     return s
 37 
 38 #推荐函数（推荐函数，主要是根据你键入的文本，在百度图片里找到相似的内容，返回给用户，类似于百度搜索的最下面）
 39 def recommend(url):
 40     Re = []
 41     try:
 42         html = requests.get(url) #获取url
 43     except error.HTTPError as e:
 44         return
 45     else:
 46         html.encoding = 'utf-8'  #html解码格式为utf-8
 47         bsObj = BeautifulSoup(html.text, 'html.parser') #html.text 根据encoding定义的code返回内容. html.parser 是解析器
 48         div = bsObj.find('div', id='topRS') # 通过find()函数获取标签<div id="topRS">
 49         if div is not None:
 50             listA = div.findAll('a')  #获取子标签 find_All（）返回的是一个list find()直接返回结果
 51             for i in listA:
 52                 if i is not None:
 53                     Re.append(i.get_text())
 54         return Re
 55 
 56 #下载图片函数
 57 def dowmloadPicture(html, keyword):
 58     global num
 59     # t =0
 60     pic_url = re.findall('"objURL":"(.*?)",', html, re.S)  # 先利用正则表达式找到图片url
 61     print('找到关键词:' + keyword + '的图片，即将开始下载图片...')
 62     for each in pic_url:
 63         print('正在下载第' + str(num + 1) + '张图片，图片地址:' + str(each))
 64         try:
 65             if each is not None:
 66                 pic = requests.get(each, timeout=7)
 67             else:
 68                 continue
 69         except BaseException:
 70             print('错误，当前图片无法下载')
 71             continue
 72         else:
 73             string = file + r'\' + keyword + '_' + str(num) + '.jpg'
 74             fp = open(string, 'wb')
 75             fp.write(pic.content)
 76             #text 返回的是unicode 型的数据，一般是在网页的header中定义的编码形式。
 77             # content返回的是bytes，二级制型的数据。也就是说你如果想要提取文本就用text.但是如果你想要提取图片、文件，就要用到content
 78             fp.close()
 79             num += 1
 80         if num >= numPicture:
 81             return
 82 
 83 
 84 if __name__ == '__main__':  # 主函数入口
 85     word = input("请输入搜索关键词(可以是人名，地名等): ")
 86     # add = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%BC%A0%E5%A4%A9%E7%88%B1&pn=120'
 87     url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
 88     tot = Find(url)
 89     Recommend = recommend(url)  # 记录相关推荐
 90     print('经过检测%s类图片共有%d张' % (word, tot))
 91     numPicture = int(input('请输入想要下载的图片数量 '))
 92     file = input('请建立一个存储图片的文件夹，输入文件夹名称即可')
 93     y = os.path.exists(file)
 94     if y == 1:
 95         print('该文件已存在，请重新输入')
 96         file = input('请建立一个存储图片的文件夹，)输入文件夹名称即可')
 97         os.mkdir(file)
 98     else:
 99         os.mkdir(file)
100     t = 0
101     tmp = url
102     while t < numPicture:
103         try:
104             url = tmp + str(t)
105             result = requests.get(url, timeout=10)
106             print(url)
107         except error.HTTPError as e:
108             print('网络错误，请调整网络后重试')
109             t = t + 60
110         else:
111             dowmloadPicture(result.text, word)
112             t = t + 60
113 
114     print('当前搜索结束，感谢使用')
115     print('猜你喜欢')
116     for re in Recommend:
117         print(re, end='  ')

查看全文

相关阅读:
Ubuntu 18.04安装gcc、g++ 4.8
Java 接口返回值集合防止空指针
 Linux CentOS7.9环境下搭建Java Web 环境
 Springboot集成UReport2
linux 环境中单独执行 python 脚本
 sql 注入的问题
 检验上传文件的大小
 Gunicorn使用讲解
 CentOS下安装部署对象存储服务MinIO
阿里云CentOS7安装MySQL

原文地址：https://www.cnblogs.com/liupengpengg/p/12302285.html