zoukankan      html  css  js  c++  java
  • 美剧爬虫

    #!/usr/bin/python
    #-*- coding:utf-8 -*-

    import urllib.request
    from urllib import parse
    from lxml import etree
    import ssl
    from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QTextEdit, QVBoxLayout, QPushButton, QMessageBox
    import sys

    # 取消代理验证
    ssl._create_default_https_context = ssl._create_unverified_context

    class TextEditMeiJu(QWidget):
    def __init__(self, parent=None):
    super(TextEditMeiJu, self).__init__(parent)
    # 定义窗口头部信息
    self.setWindowTitle('美剧天堂')
    # 定义窗口的初始大小
    self.resize(500, 600)
    # 创建单行文本框
    self.textLineEdit = QLineEdit()
    # 创建一个按钮
    self.btnButton = QPushButton('确定')
    # 创建多行文本框
    self.textEdit = QTextEdit()
    # 实例化垂直布局
    layout = QVBoxLayout()
    # 相关控件添加到垂直布局中
    layout.addWidget(self.textLineEdit)
    layout.addWidget(self.btnButton)
    layout.addWidget(self.textEdit)
    # 设置布局
    self.setLayout(layout)
    # 将按钮的点击信号与相关的槽函数进行绑定,点击即触发
    self.btnButton.clicked.connect(self.buttonClick)

    # 点击确认按钮
    def buttonClick(self):
    # 爬取开始前提示一下
    start = QMessageBox.information(
    self, '提示', '是否开始爬取《' + self.textLineEdit.text() + "》",
    QMessageBox.Ok | QMessageBox.No, QMessageBox.Ok
    )
    # 确定爬取
    if start == QMessageBox.Ok:
    self.page = 1
    self.loadSearchPage(self.textLineEdit.text(), self.page)
    # 取消爬取
    else:
    pass

    # 加载输入美剧名称后的页面
    def loadSearchPage(self, name, page):
    # 将文本转为 gb2312 编码格式
    name = parse.quote(name.encode('gb2312'))
    # 请求发送的 url 地址
    url = "https://www.meijutt.com/search/index.asp?page=" + str(page) + "&searchword=" + name + "&searchtype=-1"
    # 请求报头
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"}
    # 发送请求
    request = urllib.request.Request(url, headers=headers)
    # 获取请求的 html 文档
    html = urllib.request.urlopen(request).read()
    # 对 html 文档进行解析
    text = etree.HTML(html)
    # xpath 获取想要的信息
    pageTotal = text.xpath('//div[@class="page"]/span[1]/text()')
    # 判断搜索内容是否有结果
    if pageTotal:
    self.loadDetailPage(pageTotal, text, headers)
    # 搜索内容无结果
    else:
    self.infoSearchNull()

    # 加载点击搜索页面点击的本季页面
    def loadDetailPage(self, pageTotal, text, headers):
    # 取出搜索的结果一共多少页
    pageTotal = pageTotal[0].split('/')[1].rstrip("页")
    # 获取每一季的内容(剧名和链接)
    node_list = text.xpath('//a[@class="B font_14"]')
    items = {}
    items['name'] = self.textLineEdit.text()
    # 循环获取每一季的内容
    for node in node_list:
    # 获取信息
    title = node.xpath('@title')[0]
    link = node.xpath('@href')[0]
    items["title"] = title
    # 通过获取的单季链接跳转到本季的详情页面
    requestDetail = urllib.request.Request("https://www.meijutt.com" + link, headers=headers)
    htmlDetail = urllib.request.urlopen(requestDetail).read()
    textDetail = etree.HTML(htmlDetail)
    node_listDetail = textDetail.xpath('//div[@class="tabs-list current-tab"]//strong//a/@href')
    self.writeDetailPage(items, node_listDetail)
    # 爬取完毕提示
    if self.page == int(pageTotal):
    self.infoSearchDone()
    else:
    self.infoSearchContinue(pageTotal)

    # 将数据显示到图形界面
    def writeDetailPage(self, items, node_listDetail):
    for index, nodeLink in enumerate(node_listDetail):
    items["link"] = nodeLink
    # 写入图形界面  权利的游戏
    self.textEdit.append(
    "<div>"
    "<font color='black' size='3'>" + items['name'] + "</font>" + " "
    "<font color='red' size='3'>" + items['title'] + "</font>" + " "
    "<font color='orange' size='3'>第" + str(index + 1) + "集</font>" + " "
    "<font color='green' size='3'>下载链接:</font>" + " "
    "<font color='blue' size='3'>" + items['link'] + "</font>"
    "<p></p>"
    "</div>"
    )

    # 搜索不到结果的提示信息
    def infoSearchNull(self):
    QMessageBox.information(
    self, '提示', '搜索结果不存在,请重新输入搜索内容',
    QMessageBox.Ok, QMessageBox.Ok
    )

    # 爬取数据完毕的提示信息
    def infoSearchDone(self):
    QMessageBox.information(
    self, '提示', '爬取《' + self.textLineEdit.text() + '》完毕',
    QMessageBox.Ok, QMessageBox.Ok
    )

    # 多页情况下是否继续爬取的提示信息
    def infoSearchContinue(self, pageTotal):
    end = QMessageBox.information(
    self, '提示', '爬取第' + str(self.page) + '页《' + self.textLineEdit.text() + '》完毕,还有' + str(int(pageTotal) - self.page) + '页,是否继续爬取',
    QMessageBox.Ok | QMessageBox.No, QMessageBox.No
    )
    if end == QMessageBox.Ok:
    self.page += 1
    self.loadSearchPage(self.textLineEdit.text(), self.page)
    else:
    pass


    if __name__ == '__main__':
    app = QApplication(sys.argv)
    win = TextEditMeiJu()
    win.show()
    sys.exit(app.exec_())

    ed2k://|file|权力的游戏.Game.of.Thrones.S06E10.End.中英字幕.WEB-HR.AAC.1024X576.x264.mp4|665375958|c3d7ab76dcd3de975e2b847adaea24a0|h=7h3vzzenle2c2bh64lxqbxch7sel3vvp|/

  • 相关阅读:
    RunLoop详解
    NSURLConnection基本用法(苹果原生)
    NSThread 基本使用
    NSOperation/NSOperationQueue详细使用介绍
    JSON解析
    XML解析
    GCD详细用法
    [Java]手动编译Java
    [Selenium] Grid 介绍
    [SoapUI] 循环遍历某个Test Case下的所有Test Step,将Cookie传递给这些Test Step
  • 原文地址:https://www.cnblogs.com/beyonne/p/10681380.html
Copyright © 2011-2022 走看看