Building a crawler GUI program with Python

The overall approach is the same as the earlier post on crawling Douban beauty pictures; this time image classification is added, and the tkinter module is used to turn it into a GUI program.

The result looks like this:

The complete code is as follows:

# -*- coding:utf-8 -*-

import requests
from requests.exceptions import RequestException
import tkinter as tk
from tkinter import ttk
from bs4 import BeautifulSoup
import bs4
from tkinter import *
from tkinter.filedialog import askdirectory
import os

class DB:
    def __init__(self):
        self.window = tk.Tk()  # create the main window
        self.window.title("Crawler Pics")  # set the window title
        # self.window.resizable(0, 0)  # uncomment to forbid resizing the window
        self.menu = ttk.Combobox(self.window, width=6)
        self.path = StringVar()
        self.lab1 = tk.Label(self.window, text="目标路径:")
        self.lab2 = tk.Label(self.window, text="选择分类:")
        self.lab3 = tk.Label(self.window, text="爬取页数:")
        self.page = tk.Entry(self.window, width=5)
        self.input = tk.Entry(self.window, textvariable=self.path, width=80)  # entry showing the image save path
        self.info = tk.Text(self.window, height=20)  # text box used as the output log

        self.menu['value'] = ('大胸妹', '小翘臀', '黑丝袜', '美腿控', '有颜值', '大杂烩')
        self.menu.current(0)

        # button for choosing the image save path
        self.t_button = tk.Button(self.window, text='选择路径', relief=tk.RAISED, width=8, height=1, command=self.select_Path)
        # button that triggers the crawl
        self.t_button1 = tk.Button(self.window, text='爬取', relief=tk.RAISED, width=8, height=1, command=self.download)
        # button that clears the output box
        self.c_button2 = tk.Button(self.window, text='清空输出', relief=tk.RAISED, width=8, height=1, command=self.cle)

    def gui_arrang(self):
        """Lay out the widgets on the window grid."""
        self.lab1.grid(row=0, column=0)
        self.lab2.grid(row=1, column=0)
        self.menu.grid(row=1, column=1, sticky=W)
        self.lab3.grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
        self.page.grid(row=2, column=1, sticky=W)
        self.input.grid(row=0, column=1)
        self.info.grid(row=3, rowspan=5, column=0, columnspan=3, padx=15, pady=15)
        self.t_button.grid(row=0, column=2, padx=5, pady=5, sticky=tk.W)
        self.t_button1.grid(row=1, column=2)
        self.c_button2.grid(row=0, column=3, padx=5, pady=5, sticky=tk.W)

    def get_cid(self):
        """Map the selected category name to the cid used in the site's query string."""
        category = {
            'DX': 2,
            'XQT': 6,
            'HSW': 7,
            'MTK': 3,
            'YYZ': 4,
            'DZH': 5
        }
        cid = None
        if self.menu.get() == "大胸妹":
            cid = category["DX"]
        elif self.menu.get() == "小翘臀":
            cid = category["XQT"]
        elif self.menu.get() == "黑丝袜":
            cid = category["HSW"]
        elif self.menu.get() == "美腿控":
            cid = category["MTK"]
        elif self.menu.get() == "有颜值":
            cid = category["YYZ"]
        elif self.menu.get() == "大杂烩":
            cid = category["DZH"]
        return cid

    def select_Path(self):
        """Open the system directory picker and store the chosen local path."""
        path_ = askdirectory()
        self.path.set(path_)

    def get_html(self, url, header=None):
        """Request a listing page and return its HTML, or None on failure."""
        try:
            response = requests.get(url, headers=header)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("请求失败")
            return None

    def parse_html(self, html, list_data):
        """Extract each img tag's name (alt) and URL (src); return them as a dict {name: src}."""
        soup = BeautifulSoup(html, 'html.parser')
        img = soup.find_all('img')
        for t in img:
            if isinstance(t, bs4.element.Tag):
                name = t.get('alt')
                img_src = t.get('src')
                list_data.append([name, img_src])
        dict_data = dict(list_data)
        return dict_data

    def get_image_content(self, url):
        """Request an image URL and return its binary content, or None on failure."""
        print("正在下载", url)
        self.info.insert('end', "正在下载:" + url + '\n')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                return r.content
            return None
        except RequestException:
            return None

    def download(self):
        base_url = 'https://www.dbmeinv.com/index.htm?'
        for i in range(1, int(self.page.get()) + 1):
            # the category (cid) and the page number (pager_offset) are passed as query parameters
            url = base_url + 'cid=' + str(self.get_cid()) + '&' + 'pager_offset=' + str(i)
            header = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip,deflate,br',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': 'www.dbmeinv.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/70.0.3538.102 Safari/537.36'
            }
            list_data = []
            html = self.get_html(url, header)
            if not html:  # skip this page if the request failed
                continue
            dictdata = self.parse_html(html, list_data)

            # create <save path>/pics/<category> folders for every category up front
            root_dir = self.input.get()
            case_list = ["大胸妹", "小翘臀", "黑丝袜", "美腿控", "有颜值", "大杂烩"]
            for t in case_list:
                if not os.path.exists(root_dir + '/pics'):
                    os.makedirs(root_dir + '/pics')
                if not os.path.exists(root_dir + '/pics/' + str(t)):
                    os.makedirs(root_dir + '/pics/' + str(t))

            # all six categories share the same save logic; the target folder
            # simply matches the category selected in the combobox
            save_path = root_dir + '/pics/' + self.menu.get()
            for name, src in dictdata.items():
                if not name or not src:  # skip img tags without alt/src
                    continue
                try:
                    file_path = save_path + '/' + name + 'q' + '.jpg'
                    if not os.path.exists(file_path):  # only download files that do not exist yet
                        content = self.get_image_content(src)
                        if content:
                            with open(file_path, 'wb') as f:
                                f.write(content)
                            print('文件保存成功')
                except OSError:  # e.g. illegal characters in the file name
                    continue

    def cle(self):
        """Clear the output box, from the first line to the end."""
        self.info.delete(1.0, "end")


def main():
    t = DB()
    t.gui_arrang()
    tk.mainloop()


if __name__ == '__main__':
    main()

Key points:

1. How to use tkinter to open the system directory picker (askdirectory)

2. How to construct the URL, parameterizing the image category and the number of pages to crawl (see the sketch below)

3. How to read the input parameters from the tkinter widgets and pass them to the crawling code
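
Taken together, the three points boil down to a few lines of tkinter plumbing. The following is a minimal, stripped-down sketch, not the program above: the widget names (path_var, page_entry, category_menu), the two-category dict, and the build_urls helper are illustrative only, and it just prints the URLs it would fetch rather than downloading anything. In the real program these values feed self.get_cid() and self.download() instead of print().

# Minimal sketch of the three key points: a directory picker, input widgets,
# and the cid/pager_offset URL construction. Names here are illustrative only.
import tkinter as tk
from tkinter import ttk
from tkinter.filedialog import askdirectory

window = tk.Tk()

# 1. let tkinter ask the OS for a directory and keep it in a StringVar
path_var = tk.StringVar()
tk.Entry(window, textvariable=path_var, width=60).grid(row=0, column=0)
tk.Button(window, text='选择路径',
          command=lambda: path_var.set(askdirectory())).grid(row=0, column=1)

# 3. the crawl parameters are read back from the widgets when the button is pressed
category_menu = ttk.Combobox(window, values=('大胸妹', '小翘臀'), width=6)
category_menu.current(0)
category_menu.grid(row=1, column=0)
page_entry = tk.Entry(window, width=5)
page_entry.grid(row=1, column=1)

def build_urls():
    # 2. the category maps to a cid, and each page becomes a pager_offset value
    cid = {'大胸妹': 2, '小翘臀': 6}[category_menu.get()]
    pages = int(page_entry.get() or 1)
    urls = ['https://www.dbmeinv.com/index.htm?cid=%d&pager_offset=%d' % (cid, i)
            for i in range(1, pages + 1)]
    print(path_var.get(), urls)

tk.Button(window, text='爬取', command=build_urls).grid(row=1, column=2)
window.mainloop()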


Below is a rough version written while practicing. It does not use tkinter and was mainly meant to work out the approach:

import re
import requests
import os
from requests.exceptions import RequestException

case = str(input("请输入你要下载的图片分类:"))
category = {
    'DX': 2,
    'XQT': 6,
    'HSW': 7,
    'MTK': 3,
    'YYZ': 4,
    'DZH': 5
}

def get_cid():
    """Map the category name typed by the user to the site's cid parameter."""
    cid = None
    if case == "大胸妹":
        cid = category["DX"]
    elif case == "小翘臀":
        cid = category["XQT"]
    elif case == "黑丝袜":
        cid = category["HSW"]
    elif case == "美腿控":
        cid = category["MTK"]
    elif case == "有颜值":
        cid = category["YYZ"]
    elif case == "大杂烩":
        cid = category["DZH"]
    return cid


base_url = 'https://www.dbmeinv.com/index.htm?'
url = base_url + 'cid=' + str(get_cid())
r = requests.get(url)
html = r.text

# extract the image titles and src URLs with regular expressions
name_pattern = re.compile(r'<img class="height_min".*?title="(.*?)"', re.S)
src_pattern = re.compile(r'<img class="height_min".*?src="(.*?\.jpg)"', re.S)

name = name_pattern.findall(html)  # extract the titles
src = src_pattern.findall(html)    # extract the srcs

d = []
for i in range(len(name)):
    d.append([name[i], src[i]])

dictdata = dict(d)

def get_content(url):
    """Request an image URL and return its binary content, or None on failure."""
    try:
        r = requests.get(url)
        if r.status_code == 200:
            return r.content
        return None
    except RequestException:
        return None

root_dir = os.path.dirname(os.path.abspath('.'))

# create pics/<category> folders for every category up front
case_list = ["大胸妹", "小翘臀", "黑丝袜", "美腿控", "有颜值", "大杂烩"]
for t in case_list:
    if not os.path.exists(root_dir + '/pics'):
        os.makedirs(root_dir + '/pics')
    if not os.path.exists(root_dir + '/pics/' + str(t)):
        os.makedirs(root_dir + '/pics/' + str(t))

def Type(category_name):
    """Download every image in dictdata into the folder of the given category."""
    save_path = root_dir + '/pics/' + str(category_name)
    for t in dictdata.items():
        try:
            file_path = save_path + '/' + t[0] + 'q' + '.jpg'
            print("正在下载: " + '"' + t[0] + '"' + t[1])
            if not os.path.exists(file_path):  # only download files that do not exist yet
                content = get_content(t[1])
                if content:
                    with open(file_path, 'wb') as f:
                        f.write(content)
        except OSError:  # e.g. illegal characters in the file name
            continue

# every category goes through the same Type() helper
if case in case_list:
    Type(case)

The result looks like this:
