  • 电影天堂 (Movie Heaven) Scraper in Practice

    #!/usr/bin/python3
    # -*- coding:utf-8 -*-
    # @Time: 2021/8/28 22:38
    # @author: Mrwhite
    # @File: 电影天堂爬虫.py
    # @DESC: Scrape movie listings from dytt8.net and save them to an Excel workbook
    import re
    import urllib.request, urllib.error  # build the request and fetch page data
    import xlwt
    from bs4 import BeautifulSoup
    
    
    def main():
        # Movie Heaven (dytt8) homepage
        baseurl = "https://dy.dytt8.net/index.htm"

        # 1. Scrape the homepage for movie title, detail link and update date,
        #    then follow each link for director / cast / Douban score / magnet link / synopsis
        datalist = getData(baseurl)

        # 2. Save the data to an Excel workbook
        saveData(datalist, "电影天堂电影.xls")
    
    # Compile the regular expressions (string patterns) used for extraction;
    # the literal "[" on the listing page must be escaped as \[ or re.compile fails
    findLink = re.compile(r'・\[<a href="/html/gndy/.*<a href="(.*?)">')  # detail-page link
    findMovieName = re.compile(r'・\[<a href="/html/gndy/.*">(.*?)</a><br/>')  # movie title
    findUpDateTime = re.compile(r'<td class="inddline" width="15%"><font color="#FF0000">(.*?)</font></td>')  # update date
    findDirect = re.compile(r'<br />◎导  演 (.*?)<br />')  # director
    findActor = re.compile(r'<br />◎主  演 (.*?)<br /><br />◎标  签')  # cast
    findScore = re.compile(r'<br />◎豆瓣评分 (.*?) from')  # Douban score
    findDownloadLink = re.compile(r'<a target="_blank" href="(.*?)">')  # download (magnet) link
    findInfo = re.compile(r'◎简  介<br /><br />  (.*?)<br />')  # synopsis
    
    def getData(baseurl):
        titles, links, updateTimes, directs, actors, scores, downloadLinks, infos = [], [], [], [], [], [], [], []
        # 1. Fetch the homepage
        html = askURl(baseurl)
        # 2. Parse it; note that nth-child must be replaced by nth-of-type for BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        item = soup.select("div:nth-of-type(2) > div:nth-of-type(1) > div > div > div.co_content8")
        item = str(item)
        titles = re.findall(findMovieName, item)  # movie titles
        linksUnSet = re.findall(findLink, item)  # relative links; prepend the site root
        for link in linksUnSet:
            links.append(f'https://dy.dytt8.net{link}')
        updateTimes = re.findall(findUpDateTime, item)  # update dates
    
        # 3. Visit each detail page for director / cast / Douban score / magnet link / synopsis
        for link in links:
            html = askURl(link)

            directUnSet = re.findall(findDirect, html)  # director
            if directUnSet == []:
                directs.append("")
            else:
                direct = directUnSet[0].replace(" ", "").replace("&middot;", "·")
                directs.append(direct)

            actorsUnset = re.findall(findActor, html)  # cast
            if actorsUnset == []:
                actors.append("")
            else:
                # strip HTML entities and the full-width space \u3000, keep the first three names
                actorList = actorsUnset[0].replace("&middot;", "·").replace("&nbsp;", "").replace("\u3000", "").split("<br />")[0:3]
                actor = "/".join(actorList)
                actors.append(actor)

            scoresUnset = re.findall(findScore, html)  # Douban score
            if scoresUnset == []:
                scores.append("no rating")
            else:
                score = scoresUnset[0].split("/")[0]
                scores.append(score)

            downloadLink = re.findall(findDownloadLink, html)  # magnet links (a list per movie)
            downloadLinks.append(downloadLink)

            infosUnSet = re.findall(findInfo, html)  # synopsis
            if infosUnSet == []:
                infos.append("")
            else:
                info = infosUnSet[0].replace("&middot;", "·").replace("&nbsp;", "").replace("&ldquo;", "")
                infos.append(info)

        datalist = [titles, updateTimes, directs, actors, scores, downloadLinks, infos]
        return datalist
    
    # Fetch the HTML of a single URL
    def askURl(url):
        head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"}
        request = urllib.request.Request(url, headers=head)
        html = ""
        try:
            response = urllib.request.urlopen(request)
            # the site serves GBK/GB2312-encoded pages; gbk is a superset of gb2312
            html = response.read().decode("gbk", errors="ignore")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
        return html  # empty string on failure instead of a NameError
    
    # Save the data to an .xls workbook
    def saveData(datalist, savepath):
        print("save......")
        book = xlwt.Workbook(encoding="utf8", style_compression=0)
        sheet = book.add_sheet("from电影天堂", cell_overwrite_ok=True)
        col = ("Title", "Updated", "Director", "Cast", "Douban score", "Magnet link", "Synopsis")
        try:
            for j in range(7):  # j is the column index, i the record index
                sheet.write(0, j, col[j])  # header row
                for i in range(len(datalist[0])):  # row 0 holds the headers, so record i goes to row i+1
                    value = datalist[j][i]
                    if isinstance(value, list):  # xlwt cannot write lists; flatten the magnet-link column
                        value = "\n".join(value)
                    sheet.write(i + 1, j, value)
            book.save(savepath)
        except Exception as e:
            print("failed to write row", i + 1, "column", j, ":", datalist[j][i])
            print(e)
    
    if __name__ == "__main__":  # entry point
        main()
        print("Scraping finished")

    The resulting workbook looks as expected (screenshot omitted here). The scraper's efficiency can still be improved, for example by fetching the detail pages concurrently, as sketched below.
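    Most of the runtime is spent in the per-link loop, which downloads detail pages one at a time. A thread pool parallelizes the downloads while keeping the results in order; this is a sketch rather than part of the original script, and fetchAll is a hypothetical helper that reuses askURl from above.

    from concurrent.futures import ThreadPoolExecutor

    def fetchAll(links, max_workers=8):
        # executor.map preserves input order, so the returned pages
        # still line up with titles and updateTimes
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            return list(executor.map(askURl, links))

    In getData, step 3 would then iterate over the pages returned by fetchAll(links) and run the regexes on each one, instead of calling askURl inside the loop. Keep max_workers modest: hammering the site with too many parallel requests is a quick way to get blocked.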

  • Original post: https://www.cnblogs.com/mrwhite2020/p/15203355.html