zoukankan      html  css  js  c++  java
  • 爬站长之家表情包

     1 from bs4 import BeautifulSoup
     2 import os
     3 import requests
     4 
     def getHtmlText(url, s='text'):
         """Fetch ``url`` and return its body as text or as raw bytes.

         Args:
             url: Address to request.
             s: ``'text'`` to get the decoded page as ``str``; ``'content'``
                 to get the undecoded response body as ``bytes``.

         Returns:
             The response body in the requested form. When the request fails
             (connection error, timeout, HTTP error status) an empty value of
             the matching type is returned instead of raising: ``''`` for
             text mode and ``b''`` for content mode. An unrecognised ``s``
             yields ``''``.
         """
         try:
             r = requests.get(url, timeout=30)
             r.raise_for_status()
         except requests.RequestException:
             # Best-effort crawler: a page or image that cannot be fetched is
             # reported as "empty" so the caller can skip it. The empty value
             # keeps the type of the requested mode so that binary callers can
             # still write it to a file opened in 'wb' mode.
             return b'' if s == 'content' else ''

         if s == 'content':
             return r.content
         if s == 'text':
             # Guess the charset from the body only when text is wanted; the
             # detection is wasted work on binary image data.
             r.encoding = r.apparent_encoding
             return r.text
         return ''
    19 
    20 
    def getEmotionInfo(html):
        """Find every emoticon pack on a listing page and crawl each one.

        Each pack is looked up as the first ``<a>`` inside a
        ``<div class="num_1">`` that sits inside a ``<div class="up">``. The
        link's ``title`` and ``href`` attributes are handed to
        ``getEmotionImgInfo``.

        Args:
            html: Markup of one listing page. An empty string simply yields
                no packs.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for div in soup.find_all('div', attrs={'class': 'up'}):
            num_div = div.find('div', attrs={'class': 'num_1'})
            a = num_div.find('a') if num_div is not None else None
            if a is None:
                # Entry without the expected link: skip it rather than abort
                # the whole page.
                continue
            title = a.get('title')
            href = a.get('href')
            if not title or not href:
                continue
            getEmotionImgInfo(title, href)
    30 
    def getEmotionImgInfo(title, href):
        """Collect the image URLs of one emoticon pack and download them.

        The images are taken from the element that follows
        ``<div class="img_text">`` on the pack page; every ``<img>`` in it
        that has a ``src`` attribute contributes one URL. The resulting list
        is passed to ``getImg`` together with ``title``.

        Args:
            title: Name of the pack, forwarded to ``getImg``.
            href: Address of the pack's detail page.
        """
        html = getHtmlText(href)
        soup = BeautifulSoup(html, 'html.parser')

        marker = soup.find('div', attrs={'class': 'img_text'})
        if marker is None:
            # The fetch failed (empty markup) or the page lacks the marker.
            return
        # Next element sibling, regardless of how many whitespace text nodes
        # sit between the two elements.
        img_div = marker.find_next_sibling()
        if img_div is None:
            return

        url_list = [
            img['src'] for img in img_div.find_all('img') if img.get('src')
        ]
        getImg(title, url_list)
    42 
    def getImg(title, url_list, base_dir='D://pics//'):
        """Download every image of a pack into ``<base_dir>/<title>``.

        Files that already exist on disk are left untouched, so an
        interrupted crawl can be resumed. A progress line is printed after
        each URL has been handled.

        Args:
            title: Pack name; used as the sub-directory name and in the
                progress message.
            url_list: Image URLs. The text after the last ``/`` of each URL
                becomes the file name.
            base_dir: Directory under which the pack directory is created.
                Defaults to the location the script has always used.
        """
        root = os.path.join(base_dir, title)
        # Creates missing parent directories too and tolerates an existing
        # directory.
        os.makedirs(root, exist_ok=True)

        total = len(url_list)
        for done, url in enumerate(url_list, start=1):
            path = os.path.join(root, url.split('/')[-1])
            if not os.path.exists(path):
                img_content = getHtmlText(url, 'content')
                # An empty result means the download failed; writing it would
                # leave a zero-byte file that blocks a later retry.
                if img_content:
                    with open(path, 'wb') as f:
                        f.write(img_content)
            # Count files that were already present as well, so the
            # percentage reaches 100 when the pack is complete.
            print('\r{}文件进度:{:.2f}%'.format(title, done * 100 / total),
                  end=',')
    57 
    if __name__ == '__main__':
        first_url = 'http://sc.chinaz.com/biaoqing/index.html'
        root_url = 'http://sc.chinaz.com/biaoqing/index_'
        pages = 20

        # Walk the listing pages one after another. The upper bound is
        # inclusive so that exactly `pages` pages are visited.
        for i in range(1, pages + 1):
            if i == 1:
                # The first listing page has no numeric suffix.
                url = first_url
            else:
                url = root_url + str(i) + '.html'
            getEmotionInfo(getHtmlText(url))
  • 相关阅读:
    MySQL-MMM方案
    MySQL双主复制
    MySQL主从复制
    Keepalived实现高可用
    CentOS7.2 部署Haproxy 1.7.2
    博客园写随笔时用数学公式
    Java中有三种移位运算符
    VS Code配置C/C++环境
    Visual Studio Code 如何编写运行 C、C++ 程序?
    头一次知道“原地算法”?!
  • 原文地址:https://www.cnblogs.com/jp-mao/p/6759005.html
Copyright © 2011-2022 走看看