
Scraping Netbian (彼岸桌面) wallpapers with Python

Reference: https://blog.csdn.net/Zhangguohao666/article/details/105131503



Install the selenium package with pip. Open a command prompt in C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\Scripts and run:

pip install -i https://mirrors.aliyun.com/pypi/simple selenium

1  pip install is the install command

2  -i https://mirrors.aliyun.com/pypi/simple specifies the mirror index to install from

3  selenium is the package to install (a quick sanity check is sketched below)
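
As a quick sanity check, a minimal sketch (assuming Chrome and a matching ChromeDriver are installed and on the PATH, which the original post does not state) confirms that selenium imports and a headless browser starts:

    # Smoke test: selenium imports and a headless Chrome starts.
    # Assumes Chrome plus a matching ChromeDriver are on the PATH.
    import selenium
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    print("selenium version:", selenium.__version__)

    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("http://pic.netbian.com")
        print("page title:", driver.title)
    finally:
        driver.quit()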

     

import json
import os
import random
import sys
from concurrent.futures import ThreadPoolExecutor
import pymysql
import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

class Spider:
    def __init__(self, cookieFile=None):
        if cookieFile is None:
            self.writeCookie()
            self.cookies = self.readCookies("cookie.txt")
        else:
            self.cookies = self.readCookies(cookieFile)

        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
        }
        self.indexUrl = "http://pic.netbian.com"
        self.catelogue = self.getCatalogue()
        # The site only allows 200 downloads per day
        self.downCount = 0
        self.ddir = 'D:\\桌面壁纸\\'

    def writeCookie(self):
        """Log in via QQ in a headless Chrome and dump the session cookies to cookie.txt."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome = webdriver.Chrome(options=chrome_options)
        try:
            chrome.get("http://pic.netbian.com/e/memberconnect/?apptype=qq")
            ptlogin_iframe = WebDriverWait(chrome, 15).until(
                EC.presence_of_element_located((By.ID, "ptlogin_iframe")))
            chrome.switch_to.frame(ptlogin_iframe)
            chrome.find_element(By.ID, "switcher_plogin").click()
            # Enter the QQ account number
            chrome.find_element(By.ID, "u").send_keys("your QQ number")
            # Enter the QQ password
            chrome.find_element(By.ID, "p").send_keys("your QQ password")
            # Click the login button
            chrome.find_element(By.ID, "login_button").click()
            WebDriverWait(chrome, 15).until(EC.url_to_be("http://pic.netbian.com/"))
            with open("cookie.txt", "wt") as f:
                for cookieMap in chrome.get_cookies():
                    k = cookieMap["name"]
                    v = cookieMap["value"]
                    f.write(k + "=" + v + "\n")
        finally:
            chrome.quit()

    def readCookies(self, cookieFile):
        """Parse name=value lines from the cookie file into a dict."""
        cookies = {}
        with open(cookieFile, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                k, v = line.split("=", 1)
                cookies[k] = v
        return cookies

    def reqGet(self, url):
        # The site serves GBK-encoded pages
        html = requests.get(url, headers=self.headers, cookies=self.cookies).content.decode("gbk")
        return html

    def getImg(self, url):
        return requests.get(url, headers=self.headers, cookies=self.cookies)

    def getCatalogue(self):
        """Collect (title, href) pairs for every category on the front page."""
        index = self.reqGet(self.indexUrl)
        h = etree.HTML(index)
        href = h.xpath('//div[@class="classify clearfix"]/a/@href')
        title = h.xpath('//div[@class="classify clearfix"]/a/text()')
        return zip(title, href)

    def getRealUrl(self, href):
        """
        Resolve a detail-page href to a (title, download URL) pair, e.g.
        ('阿尔卑斯山风景4k高清壁纸3840x2160', 'http://pic.netbian.com/downpic.php?id=21953&classid=53')
        """
        dh = self.reqGet(self.indexUrl + href)
        h = etree.HTML(dh)
        dataId = h.xpath('//div[@class="downpic"]/a/@data-id')[0]
        title = h.xpath('//div[@class="photo-hd"]/h1/text()')[0]
        url = "{0}/e/extend/downpic.php?id={1}&t={2}".format(self.indexUrl, dataId, random.random())
        msg = self.reqGet(url)
        return title, self.indexUrl + json.loads(msg)['pic']

    def getPicUrls(self, url=None, html=None):
        if html is None:
            html = self.reqGet(url)
        h = etree.HTML(html)
        hrefs = h.xpath('//ul[@class="clearfix"]/li/a/@href')
        realHrefs = []
        for href in hrefs:
            realHrefs.append(self.getRealUrl(href))
        return realHrefs

    def getMaxPage(self, html):
        h = etree.HTML(html)
        pages = h.xpath('//div[@class="page"]/a/text()')
        return int(pages[-2].strip())

    def saveToDB(self, category, v, i):
        url = "%s%sindex_%d.html" % (self.indexUrl, v, i)
        if i == 1:
            url = "%s%sindex.html" % (self.indexUrl, v)
        nus = self.getPicUrls(url=url)
        for nu in nus:
            self.add(category, nu[0], nu[1])

    def savePicInfoToDB(self):
        executor = ThreadPoolExecutor(max_workers=64)
        for c, v in self.catelogue:
            html = self.reqGet(self.indexUrl + v)
            if not os.path.exists("%s%s" % (self.ddir, c)):
                os.mkdir("%s%s" % (self.ddir, c))
            print("%s%s" % (self.ddir, c))
            maxPage = self.getMaxPage(html)
            for i in range(1, maxPage + 1):
                executor.submit(self.saveToDB, c, v, i)
        executor.shutdown(wait=True)

    def getConn(self):
        conn = pymysql.Connect(
            host="127.0.0.1",
            port=3306,
            charset='utf8',
            user='root',
            password='toor',
            db='photos'
        )
        return conn

    def add(self, category, filename, url):
        conn = self.getConn()
        cursor = conn.cursor()
        try:
            # A parameterized query avoids quoting problems and SQL injection
            sql = "INSERT INTO purl VALUES (%s, %s, %s)"
            cursor.execute(sql, (category, filename, url))
            conn.commit()
            print(filename + " was added to database successfully")
        except pymysql.MySQLError:
            sys.stderr.write(filename + " already exists!\n")
        finally:
            cursor.close()
            conn.close()

    def downPic(self):
        executor = ThreadPoolExecutor(max_workers=32)
        sql = "select * from purl"
        conn = self.getConn()
        cursor = conn.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        for index in range(0, len(result)):
            if self.downCount > 200:
                print("finished today, welcome back tomorrow!")
                break
            executor.submit(self.download, result[index])
        executor.shutdown(wait=True)
        cursor.close()
        conn.close()

    def download(self, cnu):
        path = "{0}{1}\\{2}.jpg".format(self.ddir, cnu[0], cnu[1])
        # Skip files that were already downloaded successfully
        if os.path.exists(path) and os.path.getsize(path) > 10000:
            return
        print("download... " + path)
        rimg = self.getImg(cnu[2])
        if rimg.status_code != 200 or len(rimg.content) <= 1024:
            print("invalid img!")
            return
        with open(path, "wb") as f:
            f.write(rimg.content)
            self.downCount += 1
        print(str(self.downCount) + ": finished!!! " + path)

    def start(self, hasUrlData=None):
        if hasUrlData is None:
            self.savePicInfoToDB()
        self.downPic()


if __name__ == '__main__':
    spider = Spider()  # if there is no saved cookie file
    # spider = Spider(cookieFile="D:")  # if a cookie file already exists
    spider.start(hasUrlData=True)  # URL data already in the database: download directly; omit the argument to crawl first
The script above scrapes original-resolution images from Netbian.
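
The Spider assumes a MySQL database named photos with a three-column purl table, but the post never shows the schema. Here is a minimal sketch of a compatible one-time setup; the column names category, filename and url and the unique key are assumptions inferred from the INSERT and the duplicate-row error branch, not taken from the original:

    # One-time setup for the photos.purl table the spider writes to.
    # Column names and the unique key are assumptions inferred from the
    # three-column INSERT and the "already exists" error handling.
    import pymysql

    conn = pymysql.Connect(host="127.0.0.1", port=3306, charset='utf8',
                           user='root', password='toor', db='photos')
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS purl (
            category VARCHAR(64)  NOT NULL,
            filename VARCHAR(255) NOT NULL,
            url      VARCHAR(512) NOT NULL,
            PRIMARY KEY (category, filename)
        )
    """)
    conn.commit()
    cursor.close()
    conn.close()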
import re
import requests
import os
import time

'''Scrape wallpapers from the Netbian desktop site'''
# URL template for the listing pages
urls = 'http://www.netbian.com/meishi/index_{}.htm'

# Set a User-Agent header to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}

# Create the folder that will hold the images
if not os.path.exists("D:\\桌面壁纸"):
    os.mkdir("D:\\桌面壁纸")

# Crawl the first 5 pages
for i in range(1, 6):
    # time.sleep throttles requests so the crawl rate does not trip the server
    time.sleep(1)
    # The first page has a special URL, handle it separately
    if i == 1:
        main_url = 'http://www.netbian.com/meishi/index.htm'
    # Every other page follows the template
    else:
        main_url = urls.format(i)
    print('--------------------------')
    # GET the listing page
    response = requests.get(main_url, headers=headers)
    # HTML text of the page
    html = response.text
    # Regex match: the capture group (.*?) keeps only the src value and
    # skips the alt text; both patterns are non-greedy
    img_urls = re.findall(r'<img src="(.*?)" alt=".*?">', html)
    for url in img_urls:
        # Take the file name from the last URL segment
        img_name = url.split('/')[-1]
        # GET the image itself
        response = requests.get(url, headers=headers)
        # Save the image bytes
        with open("D:\\桌面壁纸\\{}".format(img_name), "wb") as file:
            file.write(response.content)
The script above scrapes Netbian HD images.
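
This second script writes whatever the server returns straight to disk. A small hedged refinement of the download step (same flow, plus a status-code check and a Content-Type guard; the helper name save_image is hypothetical, not from the original post):

    # Hedged refinement of the download step: skip failed requests and
    # non-image responses. save_image is a hypothetical helper name.
    import os
    import requests

    def save_image(url, headers, dest_dir="D:\\桌面壁纸"):
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print("skip (HTTP %d): %s" % (response.status_code, url))
            return
        if not response.headers.get("Content-Type", "").startswith("image/"):
            print("skip (not an image): " + url)
            return
        img_name = url.split('/')[-1]
        with open(os.path.join(dest_dir, img_name), "wb") as file:
            file.write(response.content)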
Original post: https://www.cnblogs.com/deng779256146/p/13535721.html