zoukankan      html  css  js  c++  java
  • 人工智能—爬虫

    抓取图片

    # 导包
    import requests
    import re
    from lxml import etree
    import os


    # 定义请求类
    # Scraper class: fetches the index page (cached to disk) and downloads
    # the images referenced in the page's style attributes.
    class PearVideo(object):

      def get_countent(self, url, type):
        """Fetch *url*, caching the HTML on disk, and return it as str.

        type == 'index' caches to test_pear.html; anything else caches
        to inner_pear.html.  A cached copy is read instead of re-fetching.
        """
        if type == 'index':
          file_name = 'test_pear.html'
        else:
          file_name = 'inner_pear.html'

        # BUG FIX: the existence check used the hard-coded
        # 'test_pear.html' instead of the file chosen above, so inner
        # pages were never cached/read correctly.
        if not os.path.exists(file_name):
          # Send the HTTP request and decode the body as UTF-8.
          r = requests.get(url)
          html = r.content.decode('utf-8')

          # Cache the page so repeated runs do not re-download it.
          with open('./' + file_name, 'w', encoding='utf-8') as f:
            f.write(html)
          return html
        else:
          # Cached copy exists: read and return it.
          with open('./' + file_name, encoding='utf-8') as f:
            return f.read()

      def get_xpath(self, html):
        """Extract image URLs from the index HTML and download each one."""
        # BUG FIX: the raw HTML string must be parsed before .xpath()
        # can be called on it (the original crashed with AttributeError).
        html = etree.HTML(html)

        # Image URLs are embedded in style attributes, e.g.
        #   style="background-image: url(https://...);"
        html_data_img = html.xpath('//div[@class="img"]/@style')
        print(html_data_img)

        # BUG FIX: the literal parentheses of url(...) must be escaped;
        # unescaped they formed a second capture group, making findall()
        # return tuples that requests.get() cannot fetch.  The pattern is
        # compiled once, outside the loop.
        regex = re.compile(r'background-image:\s*url\((.+?)\);')
        img_list = []
        for item in html_data_img:
          found = regex.findall(item)
          if found:
            img_list.append(found[0])

        # BUG FIX: the original wrote every image to the same
        # test_pear.png and called exit(-1) after the first download,
        # so only one (repeatedly overwritten) file ever existed.
        for index, item in enumerate(img_list):
          r = requests.get(item)
          with open('./test_pear_%d.png' % index, 'wb') as f:
            f.write(r.content)

        print(img_list)
     

    if __name__ == "__main__":
        # Instantiate the scraper.
        pearvideo = PearVideo()
        # BUG FIX: the original passed the *builtin* ``type`` as the
        # second argument, so ``type == 'index'`` was always False and
        # the wrong cache file name was chosen.
        html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
        # Parse the page and download the images it references.
        pearvideo.get_xpath(html)
     


    抓取视频

    # 导包
    import requests
    import re
    from lxml import etree
    import os


    # 定义请求类
    # Scraper class: fetches pages (cached to disk), follows one detail
    # page and downloads the video it references.
    class PearVideo(object):

      def get_countent(self, url, type):
        """Fetch *url*, caching the HTML on disk, and return it as str.

        type == 'index' caches to test_pear.html; anything else caches
        to inner_pear.html.  A cached copy is read instead of re-fetching.
        """
        if type == 'index':
          file_name = 'test_pear.html'
        else:
          file_name = 'inner_pear.html'

        # BUG FIX: the existence check used the hard-coded
        # 'test_pear.html' instead of the file chosen above, so detail
        # pages were never cached/read correctly.
        if not os.path.exists(file_name):
          # Send the HTTP request and decode the body as UTF-8.
          r = requests.get(url)
          html = r.content.decode('utf-8')

          # Cache the page so repeated runs do not re-download it.
          with open('./' + file_name, 'w', encoding='utf-8') as f:
            f.write(html)
          return html
        else:
          # Cached copy exists: read and return it.
          with open('./' + file_name, encoding='utf-8') as f:
            return f.read()

      def get_xpath(self, html):
        """Find detail-page links on the index page, fetch one detail
        page and download the video it references."""
        # Parse the raw HTML string into an element tree.
        html = etree.HTML(html)

        # Detail-page hrefs are relative; make them absolute.
        html_data_url = html.xpath("//div[@class='actcontbd']/a/@href")
        print(html_data_url)
        url_list = ['https://www.pearvideo.com/' + item
                    for item in html_data_url]
        print(url_list)

        # Fetch one detail page (index 8, as in the original tutorial).
        url_page = url_list[8]
        inner_html = self.get_countent(url_page, 'inner')

        # The real video URL only appears in inline JavaScript, as
        # srcUrl="...", so a regex is used instead of xpath.
        regex = re.compile('srcUrl="(.+?)"')
        matches = regex.findall(inner_html)
        print(matches)

        # ROBUSTNESS FIX: guard the [0] so a page without srcUrl does
        # not raise IndexError.
        if matches:
          r = requests.get(matches[0])
          # 'ab' appends, as in the original ("追加").
          # BUG FIX: f.write() was not indented under the with-block in
          # the original, which was a syntax error.
          with open("./test_pear.mp4", 'ab') as f:
            f.write(r.content)



    if __name__ == "__main__":
      # Instantiate the scraper.
      pearvideo = PearVideo()
      # BUG FIX: the original passed the *builtin* ``type`` as the
      # second argument, so ``type == 'index'`` was always False and
      # the wrong cache file name was chosen.
      html = pearvideo.get_countent('https://www.pearvideo.com/', 'index')
      # Parse the index page and download the video.
      pearvideo.get_xpath(html)
     

    多线程爬虫结构

    # 导包
    import threading
    import requests
    import time

    # Container that collects the worker threads created below so the
    # main block can start and join them.
    threads = []


    # Timer helper: return a human-readable timestamp for "now".
    def get_time():
      # time.ctime() formats the current local time,
      # e.g. 'Mon Jan  1 00:00:00 2024'.
      return time.ctime()

    # Fetch *url* and report the HTTP status code of the response.
    def get_content(url):
      response = requests.get(url)
      print(response.status_code)


    # Build 20 worker threads, each fetching the same URL; they are
    # started later in the __main__ block.
    for _ in range(20):
      worker = threading.Thread(target=get_content,
                                args=('https://www.pearvideo.com',))
      threads.append(worker)



    if __name__ == "__main__":
      print('开始于:%s' % get_time())

      # Start every worker asynchronously.
      for t in threads:
        # Daemon threads die together with the main thread.
        # (t.daemon replaces setDaemon(), deprecated since Python 3.10.)
        t.daemon = True
        t.start()
      print('这一个执行到:%s'% get_time())

      # BUG FIX: the original called t.join() once after the loop, which
      # joined only the LAST thread — the process could then exit while
      # the other 19 daemon threads were still running.  Join them all.
      for t in threads:
        t.join()

      print('结束于:%s' % get_time())
     
     
     
    多线程爬取视频
    # 导入requests网络请求模块
    import requests
    # 导入lxml标签匹配模块
    from lxml import etree
    # 导入re 正则匹配模块
    import re
    #导入系统路径模块
    import os
    # 导入进程模块
    import multiprocessing
    import threading
     
     
    # Collects the detail-page video URLs gathered by Pipa().
    mylist = []

    # Request helper: GET *url* and return the raw response bytes.
    def Data(url):
      # Issue the HTTP request.
      response = requests.get(url)
      # Hand back the binary body; callers decode or save it as needed.
      return response.content

    # Scrape the category page and collect the absolute detail-page
    # URLs into the module-level ``mylist``.
    def Pipa():
      # Download the category listing (bytes).
      res = Data('https://www.pearvideo.com/category_10')

      # Parse the bytes into an element tree.
      html = etree.HTML(res)

      # One <li> per video entry.
      items = html.xpath('//*[@id="categoryList"]/li')

      for li in items:
        # ROBUSTNESS FIX: skip entries that have no link instead of
        # raising IndexError on [0].
        href = li.xpath('./div/a/@href')
        if not href:
          continue
        # Build the absolute detail-page URL and stash it.
        mylist.append('https://www.pearvideo.com/' + str(href[0]))

    # Download one video given its detail-page URL.
    def xiangqing(url):
        # Use the last path segment of the URL as the file name.
        name = str(url).split('/')[-1] + '.mp4'
        print(name)

        # Fetch the detail page (bytes).
        res = Data(url)

        # The video URL is only present in inline JavaScript, so xpath
        # cannot reach it; extract it with a regex instead.
        url = re.findall('srcUrl="(.*?)",vdoUrl=srcUrl,skinRes="//www.pearvideo.com/domain/skin",videoCDN="//video.pearvideo.com";',
    str(res))[0]

        # Fetch the video bytes themselves.
        res = Data(url)

        # Save downloads under ./video/.
        path = "video/"

        # BUG FIX: in the original, os.makedirs(path) was not indented
        # under the ``if not os.path.exists(path):`` check, which was a
        # syntax error.  exist_ok=True also avoids a race when several
        # download threads hit this line at once.
        os.makedirs(path, exist_ok=True)

        # Write the video to disk.
        with open(path + name, "wb") as f:
          f.write(res)


    if __name__ == '__main__':
      # Populate mylist with the detail-page URLs.
      Pipa()

      # BUG FIX: the original called a.join() immediately after each
      # a.start(), which made the downloads run one at a time and
      # defeated the point of threading.  Start all workers first, then
      # wait for them all.
      workers = []
      for i in mylist:
        t = threading.Thread(target=xiangqing, args=(i,))
        t.start()
        workers.append(t)
      for t in workers:
        t.join()
     
  • 相关阅读:
    获取某个文件夹下面的子文件夹(要求是第一级)
    操作手册
    GWT与GXT
    eclipse中出现:The project cannot be built until build path errors are resolved
    eclipse部署项目要做的工作及配置
    如何测试tomcat安装成功
    tomcat的安装及eclipse配置
    配置jdk
    oracle数据库的安装、完全卸载与plsql的安装以及与oracle的连接
    UVA
  • 原文地址:https://www.cnblogs.com/chengdongzi/p/10490744.html
Copyright © 2011-2022 走看看