  • Scraping Xiaocao (爬小草)




import os
import re
import threading

import requests
from bs4 import BeautifulSoup as bs


def url_open(url):
    # Fetch a URL with a browser User-Agent and a 20-second timeout.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    res = requests.get(url, headers=headers, timeout=20)
    return res


def save(url):  # takes the link of one thread page
    res = url_open(url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    title = soup.find('title').text.split('-')[0]  # thread title
    temp = soup.find_all('tr', class_='tr3')
    imglist = re.findall(r'data-src="(.*?jpg)" type', str(temp))

    for each in imglist:
        filename = each.split('/')[-1]
        img = url_open(each)
        print('saving... %s' % filename)
        with open(title + filename, 'wb') as f:
            f.write(img.content)


if __name__ == '__main__':
    os.makedirs('1024', exist_ok=True)
    os.chdir('1024')
    # Only the first listing page is scraped by default (mind your health,
    # after all); if you need more pages, adding a for loop is no big deal,
    # see the sketch below.
    url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=1'
    # Links parsed from the page are relative and need this prefix before
    # they can be opened. Years of experience say the prefix changes from
    # time to time; if the script stops working some day, check whether
    # the prefix has changed.
    urlhead = 'https://cl.e7s.win/'
    res = url_open(url)
    res.encoding = 'gbk'

    # Find all thread links on the listing page.
    soup = bs(res.text, 'lxml')
    temp = soup.find_all('td', class_="tal")
    link = []
    for each in temp:
        link.append(urlhead + each.h3.a.get('href'))

    # Download each thread in its own thread, then wait for all to finish.
    downloads = []
    for each in link:
        print(each)
        down = threading.Thread(target=save, args=[each])
        downloads.append(down)
        down.start()
    for each in downloads:
        each.join()
    print('Done')
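
Since the listing URL differs only in its `page` query parameter, extending the scraper to several pages really is just a small loop, as the comment above suggests. A minimal sketch, reusing `url_open` and the parsing code defined above and assuming the pages are numbered contiguously from 1 (`PAGES` is a hypothetical knob, not part of the original script):

# Pagination sketch: collect thread links from the first PAGES listing pages.
# PAGES is a hypothetical setting; the real upper bound depends on the forum.
PAGES = 3

link = []
for page in range(1, PAGES + 1):
    page_url = 'https://cl.e7s.win/thread0806.php?fid=16&search=&page=%d' % page
    res = url_open(page_url)
    res.encoding = 'gbk'
    soup = bs(res.text, 'lxml')
    for each in soup.find_all('td', class_='tal'):
        link.append(urlhead + each.h3.a.get('href'))

If the host prefix keeps changing, `urllib.parse.urljoin(page_url, href)` would resolve each relative link against whatever page it came from, instead of hard-coding `urlhead`.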
  • Original post: https://www.cnblogs.com/zhangrenguo/p/10512162.html